Introduction¶

In this project, our goal was to predict stock prices using a machine learning approach. To achieve this, we designed and implemented a model based on a set of carefully chosen features. These features included technical indicators such as Relative Strength Index (RSI), Money Flow Index (MFI), Exponential Moving Averages (EMA), Simple Moving Average (SMA),Moving Average Convergence Divergence (MACD) as well as historical price data encompassing the previous 1 day, 3 days, 5 days, and 1, 2, 3, 4 weeks. Additionally, rolling average values for high, low, open, close, adjusted close, and volume were incorporated.

Import Libraries¶

In [1]:
import os
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor 
from catboost import CatBoostRegressor


from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
pd.set_option('display.max_columns', None)


# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Show charts when running kernel
#init_notebook_mode(connected=True)

# Change default background color for all visualizations
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
warnings.filterwarnings("ignore")

Functions¶

In [2]:
def evaluate_regression_model(y_true, y_pred):
    """
    Calculate and print evaluation metrics for a regression model.

    Parameters:
    - y_true: Actual values.
    - y_pred: Predicted values.

    Returns:
    - Dictionary containing the evaluation metrics.
    """
    # Calculate evaluation metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Print the results
    print(f'Mean Squared Error (MSE): {np.round(mse,3)}')
    print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
    print(f'Mean Absolute Error (MAE): {np.round(mae,3)}')
    print(f'R-squared (R2): {np.round(r2,3)}')

    # Return results as a dictionary
    results = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }

    return results
In [3]:
def evaluate_regression_model2(y_true, y_pred):
    """
    Calculate and print evaluation metrics for a regression model.

    Parameters:
    - y_true: Actual values.
    - y_pred: Predicted values.

    Returns:
    - Dictionary containing the evaluation metrics.
    """
    # Calculate evaluation metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

#     # Print the results
#     print(f'Mean Squared Error (MSE): {np.round(mse,3)}')
#     print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
#     print(f'Mean Absolute Error (MAE): {np.round(mae,3)}')
#     print(f'R-squared (R2): {np.round(r2,3)}')

    # Return results as a dictionary
    results = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }

    return results
In [4]:
# Returns RSI values
def rsi(df, periods = 14):
    
    """
    Calculate the Relative Strength Index (RSI) for a given DataFrame.

    Parameters:
    - df (DataFrame): Pandas DataFrame with a 'close' column.
    - periods (int): Number of periods to consider for RSI calculation. Default is 14.

    Returns:
    - Series: A pandas Series containing the RSI values.
    """
        
        
    close = df['close']
    close_delta = close.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()

    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi
In [5]:
def gain(x):
    return ((x > 0) * x).sum()


def loss(x):
    return ((x < 0) * x).sum()


def mfi(df, n=14):
    """
    Calculate the Money Flow Index (MFI) for a given DataFrame.

    Parameters:
    - df (DataFrame): Pandas DataFrame with 'high', 'low', 'close', and 'volume' columns.
    - n (int): Number of periods to use for the MFI calculation. Default is 14.

    Returns:
    - numpy.ndarray: An array containing the MFI values.
    """
    
    high = df['high']
    low = df['low']
    close = df['close']
    volume = df['volume']
    
    typical_price = (high + low + close) / 3
    money_flow = typical_price * volume
    mf_sign = np.where(typical_price > typical_price.shift(1), 1, -1)
    signed_mf = money_flow * mf_sign

    # Calculate gain and loss using vectorized operations
    positive_mf = np.where(signed_mf > 0, signed_mf, 0)
    negative_mf = np.where(signed_mf < 0, -signed_mf, 0)

    mf_avg_gain = pd.Series(positive_mf).rolling(n, min_periods=1).sum()
    mf_avg_loss = pd.Series(negative_mf).rolling(n, min_periods=1).sum()

    return (100 - 100 / (1 + mf_avg_gain / mf_avg_loss)).to_numpy()
In [6]:
def plot_regression_accuracy(y_true, y_pred):
    """
    Create various plots to evaluate the accuracy of a linear regression model.

    Parameters:
    - y_true: Actual values.
    - y_pred: Predicted values.
    """
    # Scatter Plot
    plt.scatter(y_true, y_pred)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Scatter Plot of Actual vs Predicted Values')
    plt.show()

    # Residual Plot
    residuals = y_true - y_pred
    plt.scatter(y_pred, residuals)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.show()

    # Distribution of Residuals
    sns.histplot(residuals, kde=True)
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title('Distribution of Residuals')
    plt.show()

    # Predicted vs Actual Line
    plt.plot(y_true, y_true, linestyle='--', color='r', label='Perfect Fit')
    plt.scatter(y_true, y_pred)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Predicted vs Actual Values with Perfect Fit Line')
    plt.legend()
    plt.show()
In [7]:
def plot_predictions(df,prediction):
    
    """
    Create a Plotly graph to compare actual values with predictions.

    Parameters:
    - df (DataFrame): A pandas DataFrame containing 'date' and 'close_1d_next' columns.
    - prediction (array-like): Predicted values corresponding to the test set.
    """
        
    plot_test_df= df[df.date.dt.year>=2020]
    plot_test_df['prediction'] = prediction

    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(go.Scatter(x=df.date, y=df.close_1d_next,
                             name='Truth',
                             marker_color='LightSkyBlue'), row=1, col=1)

    fig.add_trace(go.Scatter(x=plot_test_df.date,
                             y=plot_test_df.prediction,
                             name='Prediction',
                             marker_color='MediumPurple'), row=1, col=1)
    
    # Add title and Y-axis title for the first subplot
    fig.update_layout(title_text='Train Data and Test Data', title_x=0.5, title_y=0.9)
    fig.update_yaxes(title_text='Prediction', row=1, col=1)

    fig.add_trace(go.Scatter(x=plot_test_df.date,
                             y=y_test,
                             name='Truth',
                             marker_color='LightSkyBlue',
                             showlegend=False), row=2, col=1)

    fig.add_trace(go.Scatter(x=plot_test_df.date,
                             y=prediction,
                             name='Prediction',
                             marker_color='MediumPurple',
                             showlegend=False), row=2, col=1)
    
    fig.update_yaxes(title_text='Prediction', row=2, col=1)

    fig.show()
In [8]:
def plot_feature_importance(model,X_train,top_features):

    """
    Plot the feature importance from a linear regression model and return a sorted DataFrame of feature importances.

    Parameters:
    - model: A trained linear regression model with a coef_ attribute.
    - X_train (DataFrame): The DataFrame used to train the model, for feature names.
    - num_top_features (int): Number of top features to display.

    Returns:
    - DataFrame: Sorted DataFrame with features and their importance.
    """
    
    # Get feature importance scores (coefficients)
    feature_importance = model.coef_

    # Create a DataFrame to store feature names and importance scores
    feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.abs(feature_importance)})

    # Sort features by importance
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

    top_features = top_features
    # Plot feature importance
    plt.figure(figsize=(20, 6))
    plt.barh(range(len(feature_importance_df[:top_features])), feature_importance_df[:top_features]['Importance'], align="center")
    plt.yticks(range(len(feature_importance_df[:top_features])), labels=feature_importance_df[:top_features]['Feature'])
    plt.ylabel("Features")
    plt.xlabel("Coefficient Magnitude")
    plt.title(f"Top {top_features} Feature Importance Values")
    plt.show()
    
    return feature_importance_df

Read Data¶

In [9]:
out_loc  = '/Users/isapocan/Desktop/LSU/data/'
# Define the file path for the parquet file
parquet_file_path = out_loc + "stock_1d.parquet"
In [10]:
try:
    # Read the Parquet file into a DataFrame
    df = pd.read_parquet(parquet_file_path)

    # Convert column names to lowercase for consistency
    df.columns = df.columns.str.lower()

    # Display the first few rows of the DataFrame
    display(df.head())

except Exception as e:
    print(f"An error occurred while reading the file: {e}")
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded
0 2013-01-02 94.190002 94.790001 93.959999 94.779999 67.895119 3206700.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
1 2013-01-03 94.339996 94.930000 94.129997 94.669998 67.816322 2704600.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
2 2013-01-04 94.790001 95.480003 94.540001 95.370003 68.317757 2704900.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
3 2013-01-07 95.019997 95.730003 94.760002 95.489998 68.403717 2745800.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
4 2013-01-08 95.169998 95.750000 95.099998 95.500000 68.410889 2655500.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902

Select Stock¶

In [11]:
# Filter the DataFrame to include only rows where 'symbol' is 'MDLZ'
df = df[df['symbol']=='MDLZ']

# Display the first few rows and the shape of the filtered DataFrame
display(df.head())
display(df.shape)
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded
852843 2013-01-02 25.840000 26.690001 25.780001 26.670000 21.445908 17862400.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852844 2013-01-03 26.700001 26.770000 26.490000 26.639999 21.421791 9075500.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852845 2013-01-04 26.700001 26.830000 26.549999 26.740000 21.502203 7696000.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852846 2013-01-07 26.620001 26.740000 26.549999 26.660000 21.437866 7576200.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852847 2013-01-08 26.520000 26.920000 26.459999 26.680000 21.453959 14360800.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
(2733, 15)

1. Feature engineering¶

1.a. Financial indicators¶

  1. Relative Strength Index (RSI):

Description: RSI helps you understand if a stock is likely to be overbought (prices too high) or oversold (prices too low). It looks at recent price changes to make this determination.

  1. Money Flow Index (MFI):

Description: MFI considers both price and trading volume to identify if a stock is overbought or oversold. It helps gauge the strength of buying and selling pressure.

  1. Exponential Moving Average (EMA):

Description: EMA smoothens out price data, giving more weight to recent prices. It reacts faster to price changes compared to a Simple Moving Average (SMA), making it useful for trend analysis.

  1. Simple Moving Average (SMA):

Description: SMA is a basic average of stock prices over a specific period. It provides a smoothed representation of the overall price trend, helping to identify general market direction.

  1. Moving Average Convergence Divergence (MACD):

Description: MACD is a trend-following momentum indicator that shows the relationship between two moving averages of a security's price. It helps identify potential trend reversals or momentum shifts.

  1. MACD Signal Line (MACD_signal):

Description: The MACD signal line is a nine-day EMA of the MACD. It is used to generate trading signals. When the MACD crosses above the signal line, it might be a signal to buy, and when it crosses below, it might be a signal to sell.

  1. Lag and Rolling Average values of high, low, open, close, adjusted close, and volume.
In [12]:
def add_moving_averages(df, column_name):
    """
    Adds various moving averages to the DataFrame.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The column name to calculate moving averages for.
    """
    # Exponential Moving Average (EMA)
    df['ema_9'] = df[column_name].ewm(span=9).mean().shift()

    # Simple Moving Averages (SMA) with different periods
    for period in [5, 10, 15, 30]:
        df[f'sma_{period}'] = df[column_name].rolling(window=period).mean().shift()

# Add moving averages for the 'close' column
add_moving_averages(df, 'close')

df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733 entries, 852843 to 855575
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   2733 non-null   datetime64[ns]
 1   open                   2733 non-null   float64       
 2   high                   2733 non-null   float64       
 3   low                    2733 non-null   float64       
 4   close                  2733 non-null   float64       
 5   adj close              2733 non-null   float64       
 6   volume                 2733 non-null   float64       
 7   symbol                 2733 non-null   object        
 8   security               2733 non-null   object        
 9   gics sector            2733 non-null   object        
 10  gics sub-industry      2733 non-null   object        
 11  headquarters location  2733 non-null   object        
 12  date added             2733 non-null   object        
 13  cik                    2733 non-null   int64         
 14  founded                2733 non-null   object        
 15  ema_9                  2732 non-null   float64       
 16  sma_5                  2728 non-null   float64       
 17  sma_10                 2723 non-null   float64       
 18  sma_15                 2718 non-null   float64       
 19  sma_30                 2703 non-null   float64       
dtypes: datetime64[ns](1), float64(11), int64(1), object(7)
memory usage: 448.4+ KB
In [13]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733 entries, 852843 to 855575
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   2733 non-null   datetime64[ns]
 1   open                   2733 non-null   float64       
 2   high                   2733 non-null   float64       
 3   low                    2733 non-null   float64       
 4   close                  2733 non-null   float64       
 5   adj close              2733 non-null   float64       
 6   volume                 2733 non-null   float64       
 7   symbol                 2733 non-null   object        
 8   security               2733 non-null   object        
 9   gics sector            2733 non-null   object        
 10  gics sub-industry      2733 non-null   object        
 11  headquarters location  2733 non-null   object        
 12  date added             2733 non-null   object        
 13  cik                    2733 non-null   int64         
 14  founded                2733 non-null   object        
 15  ema_9                  2732 non-null   float64       
 16  sma_5                  2728 non-null   float64       
 17  sma_10                 2723 non-null   float64       
 18  sma_15                 2718 non-null   float64       
 19  sma_30                 2703 non-null   float64       
dtypes: datetime64[ns](1), float64(11), int64(1), object(7)
memory usage: 448.4+ KB
In [14]:
# Add a Relative Strength Index (RSI) column to the DataFrame
try:
    df['rsi'] = rsi(df) # Uncomment and adjust fillna(0) if appropriate for handling missing values
except Exception as e:
    print(f"Error calculating RSI: {e}")

# Add a Money Flow Index (MFI) column to the DataFrame
try:
    df['mfi'] = mfi(df, 14) # The second argument is the period, here assumed to be 14
except Exception as e:
    print(f"Error calculating MFI: {e}")
In [15]:
df[['date','close','ema_9','sma_5','sma_10','sma_15','sma_30','rsi','mfi']]
Out[15]:
date close ema_9 sma_5 sma_10 sma_15 sma_30 rsi mfi
852843 2013-01-02 26.670000 NaN NaN NaN NaN NaN NaN 0.000000
852844 2013-01-03 26.639999 26.670000 NaN NaN NaN NaN NaN 33.904295
852845 2013-01-04 26.740000 26.653333 NaN NaN NaN NaN NaN 48.695375
852846 2013-01-07 26.660000 26.688852 NaN NaN NaN NaN NaN 39.919745
852847 2013-01-08 26.680000 26.679078 NaN NaN NaN NaN NaN 55.233142
... ... ... ... ... ... ... ... ... ...
855571 2023-11-02 67.970001 65.583879 65.934001 65.321001 64.406001 65.993000 60.257764 89.207420
855572 2023-11-03 68.820000 66.061103 66.398001 65.697001 64.868001 65.901667 63.726091 89.458580
855573 2023-11-06 68.239998 66.612883 67.160001 66.169001 65.354001 65.848000 59.885606 83.710782
855574 2023-11-07 68.489998 66.938306 67.612000 66.594001 65.728667 65.799000 60.977252 75.937617
855575 2023-11-08 69.019997 67.248644 68.067999 66.888000 66.058667 65.729667 63.259914 75.566164

2733 rows × 9 columns

In [16]:
# Calculate and display the correlation between 'rsi' and 'mfi'
if {'rsi', 'mfi'}.issubset(df.columns):
    correlation = df[['rsi', 'mfi']].corr()
    print(correlation)
else:
    print("DataFrame does not contain 'rsi' and/or 'mfi' columns.")
          rsi       mfi
rsi  1.000000  0.698958
mfi  0.698958  1.000000
In [17]:
# calculating the Moving Average Convergence Divergence (MACD) and its signal line

# Ensure the 'close' column exists in the DataFrame
if 'close' in df.columns:
    # Calculate the 12-period EMA of the closing prices
    df['macd'] = df['close'].ewm(span=12, min_periods=12).mean() - df['close'].ewm(span=26, min_periods=26).mean()

    # Calculate the 9-period EMA of the MACD values (signal line)
    df['macd_signal'] = df['macd'].ewm(span=9, min_periods=9).mean()
else:
    print("DataFrame does not contain 'close' column.")
In [18]:
# Check if the required columns exist in the DataFrame
if {'macd', 'macd_signal'}.issubset(df.columns):
    # Select rows where 'macd' and 'macd_signal' columns do not have missing values
    filtered_df = df[(~df['macd'].isna()) & (~df['macd_signal'].isna())]
    
    # Display the first few rows of these columns
    print(filtered_df[['macd', 'macd_signal']].head())
else:
    print("DataFrame does not contain 'macd' and/or 'macd_signal' columns.")
            macd  macd_signal
852876 -0.147786    -0.050945
852877 -0.175230    -0.078792
852878 -0.198438    -0.104970
852879 -0.235462    -0.132994
852880 -0.226841    -0.152855

1.b. Shift Close price in order to predict next day¶

In [19]:
# Check if the required columns exist in the DataFrame
if {'date', 'close'}.issubset(df.columns):
    # Create a new column representing the next day's closing price
    df['close_1d_next'] = df['close'].shift(-1)

    # Display the first few rows including 'date', 'close', and 'close_1d_next'
    print(df[['date', 'close', 'close_1d_next']].head())
else:
    print("DataFrame does not contain 'date' and/or 'close' columns.")
             date      close  close_1d_next
852843 2013-01-02  26.670000      26.639999
852844 2013-01-03  26.639999      26.740000
852845 2013-01-04  26.740000      26.660000
852846 2013-01-07  26.660000      26.680000
852847 2013-01-08  26.680000      27.049999

1.c. Add lag features¶

In [20]:
def add_lagged_features(df, column_name, lags):
    """
    Adds lagged features for a specified column in the DataFrame.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The column name to create lagged features for.
    - lags (list of int): The list of lag periods.
    """
    for lag in lags:
        df[f'{column_name}_{lag}d_ago'] = df[column_name].shift(lag)

def add_rolling_avg_features(df, column_name, windows):
    """
    Adds rolling average features for a specified column in the DataFrame.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The column name to create rolling average features for.
    - windows (list of int): The list of rolling window sizes.
    """
    for window in windows:
        df[f'{column_name}_{window}d_avg'] = df[column_name].rolling(window=window).mean()

# Define lag periods and rolling window sizes
lag_periods = [1, 3, 5, 7, 14, 21, 28]
rolling_windows = [3, 5, 7, 10, 15, 30]

# Columns to create features for
columns = ['close', 'adj close', 'open', 'high', 'low', 'volume']

# Add lagged and rolling average features for each column
for column in columns:
    add_lagged_features(df, column, lag_periods)
    add_rolling_avg_features(df, column, rolling_windows)

# View the DataFrame
df.head()
Out[20]:
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded ema_9 sma_5 sma_10 sma_15 sma_30 rsi mfi macd macd_signal close_1d_next close_1d_ago close_3d_ago close_5d_ago close_7d_ago close_14d_ago close_21d_ago close_28d_ago close_3d_avg close_5d_avg close_7d_avg close_10d_avg close_15d_avg close_30d_avg adj close_1d_ago adj close_3d_ago adj close_5d_ago adj close_7d_ago adj close_14d_ago adj close_21d_ago adj close_28d_ago adj close_3d_avg adj close_5d_avg adj close_7d_avg adj close_10d_avg adj close_15d_avg adj close_30d_avg open_1d_ago open_3d_ago open_5d_ago open_7d_ago open_14d_ago open_21d_ago open_28d_ago open_3d_avg open_5d_avg open_7d_avg open_10d_avg open_15d_avg open_30d_avg high_1d_ago high_3d_ago high_5d_ago high_7d_ago high_14d_ago high_21d_ago high_28d_ago high_3d_avg high_5d_avg high_7d_avg high_10d_avg high_15d_avg high_30d_avg low_1d_ago low_3d_ago low_5d_ago low_7d_ago low_14d_ago low_21d_ago low_28d_ago low_3d_avg low_5d_avg low_7d_avg low_10d_avg low_15d_avg low_30d_avg volume_1d_ago volume_3d_ago volume_5d_ago volume_7d_ago volume_14d_ago volume_21d_ago volume_28d_ago volume_3d_avg volume_5d_avg volume_7d_avg volume_10d_avg volume_15d_avg volume_30d_avg
852843 2013-01-02 25.840000 26.690001 25.780001 26.670000 21.445908 17862400.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 NaN NaN NaN NaN NaN NaN 0.000000 NaN NaN 26.639999 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
852844 2013-01-03 26.700001 26.770000 26.490000 26.639999 21.421791 9075500.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.670000 NaN NaN NaN NaN NaN 33.904295 NaN NaN 26.740000 26.670000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 21.445908 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 25.840000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 26.690001 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 25.780001 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 17862400.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
852845 2013-01-04 26.700001 26.830000 26.549999 26.740000 21.502203 7696000.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.653333 NaN NaN NaN NaN NaN 48.695375 NaN NaN 26.660000 26.639999 NaN NaN NaN NaN NaN NaN 26.683333 NaN NaN NaN NaN NaN 21.421791 NaN NaN NaN NaN NaN NaN 21.456634 NaN NaN NaN NaN NaN 26.700001 NaN NaN NaN NaN NaN NaN 26.413334 NaN NaN NaN NaN NaN 26.770000 NaN NaN NaN NaN NaN NaN 26.763334 NaN NaN NaN NaN NaN 26.490000 NaN NaN NaN NaN NaN NaN 26.273333 NaN NaN NaN NaN NaN 9075500.0 NaN NaN NaN NaN NaN NaN 1.154463e+07 NaN NaN NaN NaN NaN
852846 2013-01-07 26.620001 26.740000 26.549999 26.660000 21.437866 7576200.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.688852 NaN NaN NaN NaN NaN 39.919745 NaN NaN 26.680000 26.740000 26.670000 NaN NaN NaN NaN NaN 26.680000 NaN NaN NaN NaN NaN 21.502203 21.445908 NaN NaN NaN NaN NaN 21.453953 NaN NaN NaN NaN NaN 26.700001 25.840000 NaN NaN NaN NaN NaN 26.673334 NaN NaN NaN NaN NaN 26.830000 26.690001 NaN NaN NaN NaN NaN 26.780000 NaN NaN NaN NaN NaN 26.549999 25.780001 NaN NaN NaN NaN NaN 26.529999 NaN NaN NaN NaN NaN 7696000.0 17862400.0 NaN NaN NaN NaN NaN 8.115900e+06 NaN NaN NaN NaN NaN
852847 2013-01-08 26.520000 26.920000 26.459999 26.680000 21.453959 14360800.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.679078 NaN NaN NaN NaN NaN 55.233142 NaN NaN 27.049999 26.660000 26.639999 NaN NaN NaN NaN NaN 26.693333 26.678 NaN NaN NaN NaN 21.437866 21.421791 NaN NaN NaN NaN NaN 21.464676 21.452345 NaN NaN NaN NaN 26.620001 26.700001 NaN NaN NaN NaN NaN 26.613334 26.476001 NaN NaN NaN NaN 26.740000 26.770000 NaN NaN NaN NaN NaN 26.830000 26.79 NaN NaN NaN NaN 26.549999 26.490000 NaN NaN NaN NaN NaN 26.519999 26.366 NaN NaN NaN NaN 7576200.0 9075500.0 NaN NaN NaN NaN NaN 9.877667e+06 11314180.0 NaN NaN NaN NaN

1.d. Remove missing records after feature engineering¶

In [21]:
# Check if the DataFrame contains any missing values
if df.isna().any().any():
    # Remove records with missing values and reset the index
    df = df.dropna().reset_index(drop=True)
    print("Missing records removed. DataFrame is now cleaned.")
else:
    df = df.copy()
    print("No missing records found. DataFrame remains unchanged.")

# Display the first few rows of the cleaned DataFrame
df.head()
Missing records removed. DataFrame is now cleaned.
Out[21]:
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded ema_9 sma_5 sma_10 sma_15 sma_30 rsi mfi macd macd_signal close_1d_next close_1d_ago close_3d_ago close_5d_ago close_7d_ago close_14d_ago close_21d_ago close_28d_ago close_3d_avg close_5d_avg close_7d_avg close_10d_avg close_15d_avg close_30d_avg adj close_1d_ago adj close_3d_ago adj close_5d_ago adj close_7d_ago adj close_14d_ago adj close_21d_ago adj close_28d_ago adj close_3d_avg adj close_5d_avg adj close_7d_avg adj close_10d_avg adj close_15d_avg adj close_30d_avg open_1d_ago open_3d_ago open_5d_ago open_7d_ago open_14d_ago open_21d_ago open_28d_ago open_3d_avg open_5d_avg open_7d_avg open_10d_avg open_15d_avg open_30d_avg high_1d_ago high_3d_ago high_5d_ago high_7d_ago high_14d_ago high_21d_ago high_28d_ago high_3d_avg high_5d_avg high_7d_avg high_10d_avg high_15d_avg high_30d_avg low_1d_ago low_3d_ago low_5d_ago low_7d_ago low_14d_ago low_21d_ago low_28d_ago low_3d_avg low_5d_avg low_7d_avg low_10d_avg low_15d_avg low_30d_avg volume_1d_ago volume_3d_ago volume_5d_ago volume_7d_ago volume_14d_ago volume_21d_ago volume_28d_ago volume_3d_avg volume_5d_avg volume_7d_avg volume_10d_avg volume_15d_avg volume_30d_avg
0 2013-02-20 27.070000 27.150000 26.950001 27.030001 21.735399 17057200.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.307926 27.136 27.518 27.642000 27.589000 41.633625 53.176274 -0.147786 -0.050945 26.820000 26.959999 26.570000 27.680000 27.76 27.730000 28.080000 27.049999 26.903333 27.006 27.208571 27.426 27.588667 27.601333 21.679117 21.365499 22.258080 22.322405 22.298285 22.579723 21.751484 21.633545 21.716101 21.878994 22.053831 22.184635 22.194819 26.750000 26.690001 27.700001 27.799999 27.830000 27.969999 26.790001 26.886667 27.018 27.217143 27.386 27.553333 27.536667 27.190001 27.020000 27.830000 28.100000 27.980000 28.100000 27.080000 27.136667 27.248000 27.410000 27.618 27.779333 27.754667 26.750000 26.450001 27.270000 27.750000 27.67 27.820000 26.68 26.766667 26.842 27.015714 27.224 27.411333 27.382333 18297500.0 37728900.0 14931000.0 11159200.0 5800400.0 15906900.0 11671400.0 1.904973e+07 21756140.0 1.907480e+07 17005580.0 1.419575e+07 1.352419e+07
1 2013-02-21 26.990000 27.049999 26.639999 26.820000 21.566534 16936600.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.252312 27.006 27.426 27.588667 27.601333 38.257648 47.431888 -0.175230 -0.078792 26.770000 27.030001 26.719999 27.750000 27.75 27.790001 27.559999 27.309999 26.936666 26.820 27.075714 27.308 27.528000 27.606000 21.735399 21.486118 22.314371 22.314371 22.346525 22.161583 21.960548 21.660350 21.566534 21.772160 21.958945 22.135851 22.198572 27.070000 26.840000 27.740000 27.730000 27.650000 27.730000 27.129999 26.936666 26.868 27.111429 27.295 27.497333 27.552333 27.150000 27.070000 27.809999 27.799999 27.950001 28.040001 27.340000 27.130000 27.096000 27.302857 27.510 27.717333 27.759000 26.950001 26.600000 27.459999 27.629999 27.65 27.299999 27.09 26.780000 26.678 26.874286 27.108 27.342667 27.388333 17057200.0 21794500.0 13902600.0 9811900.0 7541300.0 18213200.0 16348500.0 1.743043e+07 22362940.0 2.009261e+07 17608410.0 1.493817e+07 1.361005e+07
2 2013-02-22 26.889999 27.129999 26.730000 26.770000 21.526327 16664800.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.165815 26.820 27.308 27.528000 27.606000 37.478423 48.958416 -0.198438 -0.104970 26.490000 26.820000 26.959999 26.570000 27.68 28.219999 27.790001 27.420000 26.873334 26.860 26.945714 27.181 27.460000 27.596667 21.566534 21.679117 21.365499 22.258080 22.692308 22.346525 22.049007 21.609420 21.598699 21.667624 21.856822 22.081171 22.191067 26.990000 26.750000 26.690001 27.700001 28.000000 27.500000 27.350000 26.983333 26.908 26.995714 27.220 27.446667 27.555667 27.049999 27.190001 27.020000 27.830000 28.320000 27.889999 27.540001 27.109999 27.118000 27.202857 27.415 27.662666 27.760667 26.639999 26.750000 26.450001 27.270000 27.93 27.350000 27.25 26.773333 26.734 26.797143 27.023 27.281333 27.390000 16936600.0 18297500.0 37728900.0 14931000.0 9623100.0 15212300.0 10162600.0 1.688620e+07 18150120.0 2.034030e+07 17828420.0 1.554640e+07 1.377650e+07
3 2013-02-25 26.790001 27.080000 26.480000 26.490000 21.301172 15527100.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.086626 26.860 27.181 27.460000 27.596667 33.378362 47.675126 -0.235462 -0.132994 26.950001 26.770000 27.030001 26.719999 27.75 27.879999 27.830000 27.480000 26.693333 26.814 26.765714 27.054 27.344667 27.569333 21.526327 21.735399 21.486118 22.314371 22.418896 22.378695 22.097254 21.464678 21.561710 21.522881 21.754699 21.988429 22.169087 26.889999 27.070000 26.840000 27.740000 28.010000 27.930000 27.459999 26.890000 26.898 26.860000 27.119 27.366000 27.544333 27.129999 27.150000 27.070000 27.809999 28.150000 28.030001 27.520000 27.086666 27.120000 27.098571 27.313 27.580000 27.752000 26.730000 26.950001 26.600000 27.459999 27.83 27.639999 27.17 26.616666 26.710 26.657143 26.896 27.184667 27.369667 16664800.0 17057200.0 21794500.0 13902600.0 8954300.0 14444500.0 8688200.0 1.637617e+07 16896640.0 2.057237e+07 18265210.0 1.594000e+07 1.374912e+07
4 2013-02-26 26.530001 26.980000 26.510000 26.950001 21.671074 13702900.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.967270 26.814 27.054 27.344667 27.569333 44.181951 48.178912 -0.226841 -0.152855 27.570000 26.490000 26.820000 26.959999 26.57 27.950001 27.780001 27.709999 26.736667 26.812 26.820000 26.974 27.282667 27.553667 21.301172 21.566534 21.679117 21.365499 22.475189 22.338484 22.282200 21.499524 21.560101 21.566535 21.690369 21.938574 22.156490 26.790001 26.990000 26.750000 26.690001 27.950001 27.830000 27.580000 26.736667 26.854 26.837143 26.999 27.267333 27.517000 27.080000 27.049999 27.190001 27.020000 28.110001 27.889999 27.740000 27.063333 27.077999 27.092857 27.231 27.502000 27.733333 26.480000 26.639999 26.750000 26.450001 27.85 27.690001 27.34 26.573333 26.662 26.665714 26.784 27.096667 27.345000 15527100.0 16936600.0 18297500.0 37728900.0 10961400.0 12066800.0 9863200.0 1.529827e+07 15977720.0 1.714009e+07 18654310.0 1.625657e+07 1.386713e+07

1.e. Split Data into Train ( before Covid ) and Test ( after Covid )¶

In [22]:
# # Calculate the index for the 70-30 split
# split_index = int(0.7 * len(df))

# # Split the DataFrame into training and testing sets
# train_df = df.iloc[:split_index]
# test_df = df.iloc[split_index:]

# Split the DataFrame into training and testing sets
train_df = df[df.date.dt.year<2020]
test_df = df[df.date.dt.year>=2020]



print(f"Train days: {len(train_df)}, Test days: {len(test_df)}")

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.date, y=train_df.close_1d_next, name='Training'))
fig.add_trace(go.Scatter(x=test_df.date,  y=test_df.close_1d_next,  name='Test'))
fig.show()
Train days: 1729, Test days: 970
In [23]:
drop_cols1 = ['date','open','high','low','close','adj close','volume','symbol','security',
 'gics sector','gics sub-industry','headquarters location','date added','cik','founded']

train_df = train_df.drop(drop_cols1, 1)

test_df  = test_df.drop(drop_cols1, 1)

# target column is next day's close price
y_train = train_df['close_1d_next'].copy()
X_train = train_df.drop(['close_1d_next'], 1)

# target column is next day's close price
y_test  = test_df['close_1d_next'].copy()
X_test  = test_df.drop(['close_1d_next'], 1)
In [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [25]:
X_train.shape, X_train_scaled.shape, X_test.shape, X_test_scaled.shape, 
Out[25]:
((1729, 87), (1729, 87), (970, 87), (970, 87))

2. Train multiple regression models¶

In [26]:
def train_and_evaluate_models(X_train_scaled,y_train,X_test_scaled,y_test):
    """
    Train and evaluate multiple regression models on a given dataframe.

    Parameters:
    - dataframe: Pandas DataFrame containing the dataset.
    - target_column: Name of the target column (dependent variable).
    - features_columns: List of column names used as features (independent variables).

    Returns:
    - A DataFrame containing evaluation metrics for each model.
    """

    # Split the data into features (X) and target variable (y)
#     X = dataframe[features_columns]
#     y = dataframe[target_column]

    # Split the data into training and testing sets (70-30 split)
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    
#     scaler = StandardScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)


    # Initialize the models
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Elastic Net': ElasticNet(),
        'SVR': SVR(),
        'K-Neighbors Regressor': KNeighborsRegressor(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'CatBoost': CatBoostRegressor()
    }

    # Initialize a DataFrame to store the evaluation metrics
    metrics_df = pd.DataFrame(columns=['Model', 'Mean Squared Error', 'Mean Absolute Error', 'R2 Score'])

    # Train and evaluate each model
    for model_name, model in models.items():
        start_time = time.time() 
       
    # Train the model
        model.fit(X_train_scaled, y_train)
        
        end_time = time.time()  # Record the end time

        training_time = end_time - start_time

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Evaluate the model
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store the metrics in the DataFrame
        metrics_df = metrics_df.append({
            'Model': model_name,
            'Mean Squared Error': mse,
            'Mean Absolute Error': mae,
            'R2 Score': r2,
            'Training Time (s)': training_time
        }, ignore_index=True)
        metrics_df = metrics_df.sort_values(by=['R2 Score'],ascending=False)
    return metrics_df
In [27]:
df_compare = train_and_evaluate_models(X_train,y_train,X_test,y_test)
Learning rate set to 0.044643
0:	learn: 6.0603979	total: 63.2ms	remaining: 1m 3s
1:	learn: 5.8169612	total: 69.1ms	remaining: 34.5s
2:	learn: 5.5755845	total: 74ms	remaining: 24.6s
3:	learn: 5.3462380	total: 78.7ms	remaining: 19.6s
4:	learn: 5.1261132	total: 83.9ms	remaining: 16.7s
5:	learn: 4.9208602	total: 88.5ms	remaining: 14.7s
6:	learn: 4.7250714	total: 93.4ms	remaining: 13.2s
7:	learn: 4.5456470	total: 97.7ms	remaining: 12.1s
8:	learn: 4.3606091	total: 102ms	remaining: 11.3s
9:	learn: 4.1830096	total: 107ms	remaining: 10.6s
10:	learn: 4.0166823	total: 111ms	remaining: 10s
11:	learn: 3.8657076	total: 117ms	remaining: 9.62s
12:	learn: 3.7140111	total: 130ms	remaining: 9.84s
13:	learn: 3.5654097	total: 140ms	remaining: 9.83s
14:	learn: 3.4264460	total: 145ms	remaining: 9.54s
15:	learn: 3.2905647	total: 150ms	remaining: 9.22s
16:	learn: 3.1619168	total: 155ms	remaining: 8.95s
17:	learn: 3.0386276	total: 161ms	remaining: 8.76s
18:	learn: 2.9250019	total: 167ms	remaining: 8.61s
19:	learn: 2.8155633	total: 172ms	remaining: 8.42s
20:	learn: 2.7067078	total: 176ms	remaining: 8.2s
21:	learn: 2.6104041	total: 180ms	remaining: 8s
22:	learn: 2.5171443	total: 186ms	remaining: 7.91s
23:	learn: 2.4234694	total: 191ms	remaining: 7.75s
24:	learn: 2.3336920	total: 195ms	remaining: 7.62s
25:	learn: 2.2501270	total: 201ms	remaining: 7.52s
26:	learn: 2.1685983	total: 206ms	remaining: 7.41s
27:	learn: 2.0914990	total: 212ms	remaining: 7.35s
28:	learn: 2.0188179	total: 217ms	remaining: 7.25s
29:	learn: 1.9490958	total: 221ms	remaining: 7.14s
30:	learn: 1.8819748	total: 225ms	remaining: 7.04s
31:	learn: 1.8183164	total: 230ms	remaining: 6.95s
32:	learn: 1.7609789	total: 234ms	remaining: 6.86s
33:	learn: 1.7008286	total: 238ms	remaining: 6.77s
34:	learn: 1.6438044	total: 243ms	remaining: 6.69s
35:	learn: 1.5954289	total: 247ms	remaining: 6.61s
36:	learn: 1.5458456	total: 251ms	remaining: 6.53s
37:	learn: 1.4999957	total: 255ms	remaining: 6.46s
38:	learn: 1.4557749	total: 259ms	remaining: 6.39s
39:	learn: 1.4136122	total: 264ms	remaining: 6.33s
40:	learn: 1.3740245	total: 268ms	remaining: 6.28s
41:	learn: 1.3360847	total: 273ms	remaining: 6.21s
42:	learn: 1.3006229	total: 277ms	remaining: 6.16s
43:	learn: 1.2683312	total: 281ms	remaining: 6.11s
44:	learn: 1.2328651	total: 285ms	remaining: 6.05s
45:	learn: 1.2022101	total: 289ms	remaining: 6s
46:	learn: 1.1716981	total: 294ms	remaining: 5.95s
47:	learn: 1.1425806	total: 298ms	remaining: 5.91s
48:	learn: 1.1126613	total: 302ms	remaining: 5.86s
49:	learn: 1.0845503	total: 306ms	remaining: 5.82s
50:	learn: 1.0590177	total: 311ms	remaining: 5.78s
51:	learn: 1.0347707	total: 315ms	remaining: 5.74s
52:	learn: 1.0113231	total: 319ms	remaining: 5.7s
53:	learn: 0.9886945	total: 323ms	remaining: 5.66s
54:	learn: 0.9669580	total: 328ms	remaining: 5.63s
55:	learn: 0.9468328	total: 332ms	remaining: 5.59s
56:	learn: 0.9270737	total: 336ms	remaining: 5.56s
57:	learn: 0.9100334	total: 340ms	remaining: 5.52s
58:	learn: 0.8917612	total: 345ms	remaining: 5.5s
59:	learn: 0.8750815	total: 349ms	remaining: 5.47s
60:	learn: 0.8601060	total: 353ms	remaining: 5.44s
61:	learn: 0.8462298	total: 358ms	remaining: 5.41s
62:	learn: 0.8345633	total: 362ms	remaining: 5.38s
63:	learn: 0.8216135	total: 367ms	remaining: 5.36s
64:	learn: 0.8083116	total: 371ms	remaining: 5.34s
65:	learn: 0.7956239	total: 375ms	remaining: 5.31s
66:	learn: 0.7843834	total: 380ms	remaining: 5.29s
67:	learn: 0.7740919	total: 384ms	remaining: 5.26s
68:	learn: 0.7643108	total: 388ms	remaining: 5.24s
69:	learn: 0.7561664	total: 393ms	remaining: 5.22s
70:	learn: 0.7491327	total: 397ms	remaining: 5.2s
71:	learn: 0.7411352	total: 403ms	remaining: 5.2s
72:	learn: 0.7330505	total: 408ms	remaining: 5.18s
73:	learn: 0.7269962	total: 413ms	remaining: 5.17s
74:	learn: 0.7192950	total: 418ms	remaining: 5.15s
75:	learn: 0.7126213	total: 422ms	remaining: 5.13s
76:	learn: 0.7067352	total: 426ms	remaining: 5.11s
77:	learn: 0.7006922	total: 430ms	remaining: 5.08s
78:	learn: 0.6944183	total: 435ms	remaining: 5.07s
79:	learn: 0.6885217	total: 439ms	remaining: 5.05s
80:	learn: 0.6825927	total: 443ms	remaining: 5.03s
81:	learn: 0.6783627	total: 448ms	remaining: 5.01s
82:	learn: 0.6741644	total: 452ms	remaining: 5s
83:	learn: 0.6696567	total: 456ms	remaining: 4.98s
84:	learn: 0.6646154	total: 461ms	remaining: 4.96s
85:	learn: 0.6599463	total: 465ms	remaining: 4.95s
86:	learn: 0.6560602	total: 470ms	remaining: 4.93s
87:	learn: 0.6536929	total: 474ms	remaining: 4.91s
88:	learn: 0.6496752	total: 478ms	remaining: 4.89s
89:	learn: 0.6462499	total: 483ms	remaining: 4.89s
90:	learn: 0.6428735	total: 488ms	remaining: 4.88s
91:	learn: 0.6397495	total: 494ms	remaining: 4.87s
92:	learn: 0.6367543	total: 499ms	remaining: 4.86s
93:	learn: 0.6337726	total: 503ms	remaining: 4.84s
94:	learn: 0.6308954	total: 507ms	remaining: 4.83s
95:	learn: 0.6280612	total: 511ms	remaining: 4.81s
96:	learn: 0.6250764	total: 515ms	remaining: 4.79s
97:	learn: 0.6223572	total: 519ms	remaining: 4.78s
98:	learn: 0.6205398	total: 524ms	remaining: 4.77s
99:	learn: 0.6186628	total: 528ms	remaining: 4.75s
100:	learn: 0.6154184	total: 532ms	remaining: 4.74s
101:	learn: 0.6124974	total: 536ms	remaining: 4.72s
102:	learn: 0.6095369	total: 541ms	remaining: 4.71s
103:	learn: 0.6081161	total: 545ms	remaining: 4.69s
104:	learn: 0.6061648	total: 549ms	remaining: 4.68s
105:	learn: 0.6037361	total: 554ms	remaining: 4.67s
106:	learn: 0.6018697	total: 558ms	remaining: 4.66s
107:	learn: 0.5996563	total: 562ms	remaining: 4.64s
108:	learn: 0.5987782	total: 567ms	remaining: 4.63s
109:	learn: 0.5962953	total: 571ms	remaining: 4.62s
110:	learn: 0.5944525	total: 575ms	remaining: 4.61s
111:	learn: 0.5925792	total: 580ms	remaining: 4.59s
112:	learn: 0.5908725	total: 584ms	remaining: 4.59s
113:	learn: 0.5892615	total: 590ms	remaining: 4.58s
114:	learn: 0.5876082	total: 595ms	remaining: 4.58s
115:	learn: 0.5856574	total: 600ms	remaining: 4.58s
116:	learn: 0.5839197	total: 606ms	remaining: 4.57s
117:	learn: 0.5821181	total: 611ms	remaining: 4.57s
118:	learn: 0.5802687	total: 617ms	remaining: 4.57s
119:	learn: 0.5783092	total: 621ms	remaining: 4.55s
120:	learn: 0.5768204	total: 625ms	remaining: 4.54s
121:	learn: 0.5753784	total: 629ms	remaining: 4.53s
122:	learn: 0.5739414	total: 634ms	remaining: 4.52s
123:	learn: 0.5731172	total: 638ms	remaining: 4.51s
124:	learn: 0.5717661	total: 642ms	remaining: 4.5s
125:	learn: 0.5699490	total: 647ms	remaining: 4.49s
126:	learn: 0.5692480	total: 651ms	remaining: 4.47s
127:	learn: 0.5683714	total: 655ms	remaining: 4.46s
128:	learn: 0.5669671	total: 659ms	remaining: 4.45s
129:	learn: 0.5661305	total: 664ms	remaining: 4.44s
130:	learn: 0.5648214	total: 668ms	remaining: 4.43s
131:	learn: 0.5632118	total: 672ms	remaining: 4.42s
132:	learn: 0.5621830	total: 676ms	remaining: 4.41s
133:	learn: 0.5614073	total: 680ms	remaining: 4.4s
134:	learn: 0.5607392	total: 685ms	remaining: 4.39s
135:	learn: 0.5592399	total: 689ms	remaining: 4.38s
136:	learn: 0.5582825	total: 694ms	remaining: 4.37s
137:	learn: 0.5570203	total: 698ms	remaining: 4.36s
138:	learn: 0.5555467	total: 702ms	remaining: 4.35s
139:	learn: 0.5544866	total: 707ms	remaining: 4.34s
140:	learn: 0.5534618	total: 712ms	remaining: 4.34s
141:	learn: 0.5527745	total: 716ms	remaining: 4.33s
142:	learn: 0.5517276	total: 720ms	remaining: 4.32s
143:	learn: 0.5511754	total: 725ms	remaining: 4.31s
144:	learn: 0.5499848	total: 729ms	remaining: 4.3s
145:	learn: 0.5492370	total: 733ms	remaining: 4.29s
146:	learn: 0.5480831	total: 737ms	remaining: 4.28s
147:	learn: 0.5470207	total: 741ms	remaining: 4.26s
148:	learn: 0.5461768	total: 745ms	remaining: 4.26s
149:	learn: 0.5452187	total: 750ms	remaining: 4.25s
150:	learn: 0.5442155	total: 754ms	remaining: 4.24s
151:	learn: 0.5431025	total: 758ms	remaining: 4.23s
152:	learn: 0.5423642	total: 762ms	remaining: 4.22s
153:	learn: 0.5414228	total: 767ms	remaining: 4.21s
154:	learn: 0.5402482	total: 771ms	remaining: 4.2s
155:	learn: 0.5390607	total: 775ms	remaining: 4.19s
156:	learn: 0.5383473	total: 779ms	remaining: 4.18s
157:	learn: 0.5373131	total: 784ms	remaining: 4.18s
158:	learn: 0.5364392	total: 789ms	remaining: 4.17s
159:	learn: 0.5357421	total: 793ms	remaining: 4.16s
160:	learn: 0.5343277	total: 798ms	remaining: 4.16s
161:	learn: 0.5337785	total: 803ms	remaining: 4.16s
162:	learn: 0.5329012	total: 809ms	remaining: 4.15s
163:	learn: 0.5317662	total: 814ms	remaining: 4.15s
164:	learn: 0.5305788	total: 819ms	remaining: 4.14s
165:	learn: 0.5291670	total: 823ms	remaining: 4.13s
166:	learn: 0.5279711	total: 827ms	remaining: 4.13s
167:	learn: 0.5263201	total: 831ms	remaining: 4.12s
168:	learn: 0.5250413	total: 836ms	remaining: 4.11s
169:	learn: 0.5239617	total: 840ms	remaining: 4.1s
170:	learn: 0.5232086	total: 844ms	remaining: 4.09s
171:	learn: 0.5223393	total: 849ms	remaining: 4.09s
172:	learn: 0.5213785	total: 853ms	remaining: 4.08s
173:	learn: 0.5205693	total: 858ms	remaining: 4.07s
174:	learn: 0.5195080	total: 862ms	remaining: 4.06s
175:	learn: 0.5184230	total: 866ms	remaining: 4.05s
176:	learn: 0.5179592	total: 870ms	remaining: 4.05s
177:	learn: 0.5170185	total: 874ms	remaining: 4.04s
178:	learn: 0.5158612	total: 879ms	remaining: 4.03s
179:	learn: 0.5145391	total: 883ms	remaining: 4.02s
180:	learn: 0.5134332	total: 887ms	remaining: 4.01s
181:	learn: 0.5123862	total: 891ms	remaining: 4.01s
182:	learn: 0.5109381	total: 896ms	remaining: 4s
183:	learn: 0.5094690	total: 900ms	remaining: 3.99s
184:	learn: 0.5087636	total: 904ms	remaining: 3.98s
185:	learn: 0.5082193	total: 908ms	remaining: 3.97s
186:	learn: 0.5077741	total: 912ms	remaining: 3.97s
187:	learn: 0.5064784	total: 917ms	remaining: 3.96s
188:	learn: 0.5053126	total: 921ms	remaining: 3.95s
189:	learn: 0.5045861	total: 925ms	remaining: 3.94s
190:	learn: 0.5035784	total: 929ms	remaining: 3.94s
191:	learn: 0.5025377	total: 934ms	remaining: 3.93s
192:	learn: 0.5017748	total: 938ms	remaining: 3.92s
193:	learn: 0.5008812	total: 942ms	remaining: 3.91s
194:	learn: 0.5004256	total: 946ms	remaining: 3.91s
195:	learn: 0.4994655	total: 951ms	remaining: 3.9s
196:	learn: 0.4989930	total: 955ms	remaining: 3.89s
197:	learn: 0.4980951	total: 959ms	remaining: 3.88s
198:	learn: 0.4973099	total: 963ms	remaining: 3.88s
199:	learn: 0.4961640	total: 968ms	remaining: 3.87s
200:	learn: 0.4946290	total: 972ms	remaining: 3.86s
201:	learn: 0.4940315	total: 976ms	remaining: 3.85s
202:	learn: 0.4931395	total: 981ms	remaining: 3.85s
203:	learn: 0.4920884	total: 985ms	remaining: 3.85s
204:	learn: 0.4909474	total: 990ms	remaining: 3.84s
205:	learn: 0.4899796	total: 996ms	remaining: 3.84s
206:	learn: 0.4890645	total: 1s	remaining: 3.83s
207:	learn: 0.4882374	total: 1s	remaining: 3.82s
208:	learn: 0.4871172	total: 1.01s	remaining: 3.82s
209:	learn: 0.4861811	total: 1.01s	remaining: 3.81s
210:	learn: 0.4850126	total: 1.02s	remaining: 3.8s
211:	learn: 0.4837164	total: 1.02s	remaining: 3.8s
212:	learn: 0.4829656	total: 1.02s	remaining: 3.79s
213:	learn: 0.4819662	total: 1.03s	remaining: 3.78s
214:	learn: 0.4817394	total: 1.03s	remaining: 3.77s
215:	learn: 0.4811177	total: 1.04s	remaining: 3.77s
216:	learn: 0.4806224	total: 1.04s	remaining: 3.76s
217:	learn: 0.4800808	total: 1.05s	remaining: 3.75s
218:	learn: 0.4789126	total: 1.05s	remaining: 3.75s
219:	learn: 0.4784037	total: 1.05s	remaining: 3.74s
220:	learn: 0.4774369	total: 1.06s	remaining: 3.74s
221:	learn: 0.4764525	total: 1.06s	remaining: 3.73s
222:	learn: 0.4757860	total: 1.07s	remaining: 3.72s
223:	learn: 0.4748362	total: 1.07s	remaining: 3.72s
224:	learn: 0.4738011	total: 1.08s	remaining: 3.71s
225:	learn: 0.4732533	total: 1.08s	remaining: 3.71s
226:	learn: 0.4724302	total: 1.09s	remaining: 3.7s
227:	learn: 0.4714185	total: 1.09s	remaining: 3.7s
228:	learn: 0.4704901	total: 1.1s	remaining: 3.69s
229:	learn: 0.4696393	total: 1.1s	remaining: 3.68s
230:	learn: 0.4688642	total: 1.1s	remaining: 3.68s
231:	learn: 0.4684056	total: 1.11s	remaining: 3.67s
232:	learn: 0.4680263	total: 1.11s	remaining: 3.66s
233:	learn: 0.4673195	total: 1.12s	remaining: 3.66s
234:	learn: 0.4661429	total: 1.12s	remaining: 3.65s
235:	learn: 0.4651517	total: 1.13s	remaining: 3.65s
236:	learn: 0.4643784	total: 1.13s	remaining: 3.65s
237:	learn: 0.4639422	total: 1.14s	remaining: 3.64s
238:	learn: 0.4629527	total: 1.14s	remaining: 3.63s
239:	learn: 0.4621494	total: 1.15s	remaining: 3.63s
240:	learn: 0.4611603	total: 1.15s	remaining: 3.62s
241:	learn: 0.4603838	total: 1.15s	remaining: 3.62s
242:	learn: 0.4593452	total: 1.16s	remaining: 3.61s
243:	learn: 0.4586434	total: 1.16s	remaining: 3.6s
244:	learn: 0.4577788	total: 1.17s	remaining: 3.6s
245:	learn: 0.4576161	total: 1.17s	remaining: 3.59s
246:	learn: 0.4570526	total: 1.18s	remaining: 3.59s
247:	learn: 0.4560632	total: 1.18s	remaining: 3.59s
248:	learn: 0.4552049	total: 1.19s	remaining: 3.59s
249:	learn: 0.4544847	total: 1.2s	remaining: 3.59s
250:	learn: 0.4536628	total: 1.2s	remaining: 3.58s
251:	learn: 0.4529050	total: 1.21s	remaining: 3.58s
252:	learn: 0.4521824	total: 1.21s	remaining: 3.57s
253:	learn: 0.4515072	total: 1.21s	remaining: 3.57s
254:	learn: 0.4506698	total: 1.22s	remaining: 3.56s
255:	learn: 0.4501205	total: 1.22s	remaining: 3.56s
256:	learn: 0.4493848	total: 1.23s	remaining: 3.55s
257:	learn: 0.4486159	total: 1.23s	remaining: 3.55s
258:	learn: 0.4482442	total: 1.24s	remaining: 3.54s
259:	learn: 0.4478827	total: 1.24s	remaining: 3.54s
260:	learn: 0.4473978	total: 1.25s	remaining: 3.53s
261:	learn: 0.4468994	total: 1.25s	remaining: 3.53s
262:	learn: 0.4456701	total: 1.26s	remaining: 3.52s
263:	learn: 0.4449110	total: 1.26s	remaining: 3.52s
264:	learn: 0.4442542	total: 1.27s	remaining: 3.51s
265:	learn: 0.4431941	total: 1.27s	remaining: 3.51s
266:	learn: 0.4425103	total: 1.27s	remaining: 3.5s
267:	learn: 0.4416661	total: 1.28s	remaining: 3.49s
268:	learn: 0.4411630	total: 1.28s	remaining: 3.49s
269:	learn: 0.4404441	total: 1.29s	remaining: 3.48s
270:	learn: 0.4400363	total: 1.29s	remaining: 3.48s
271:	learn: 0.4398475	total: 1.3s	remaining: 3.47s
272:	learn: 0.4390363	total: 1.3s	remaining: 3.47s
273:	learn: 0.4381161	total: 1.31s	remaining: 3.46s
274:	learn: 0.4374399	total: 1.31s	remaining: 3.46s
275:	learn: 0.4370343	total: 1.31s	remaining: 3.45s
276:	learn: 0.4366440	total: 1.32s	remaining: 3.45s
277:	learn: 0.4363497	total: 1.32s	remaining: 3.44s
278:	learn: 0.4360749	total: 1.33s	remaining: 3.43s
279:	learn: 0.4353030	total: 1.33s	remaining: 3.43s
280:	learn: 0.4346082	total: 1.34s	remaining: 3.42s
281:	learn: 0.4342305	total: 1.34s	remaining: 3.42s
282:	learn: 0.4338120	total: 1.34s	remaining: 3.41s
283:	learn: 0.4336984	total: 1.35s	remaining: 3.4s
284:	learn: 0.4329844	total: 1.35s	remaining: 3.4s
285:	learn: 0.4325585	total: 1.36s	remaining: 3.39s
286:	learn: 0.4316236	total: 1.36s	remaining: 3.38s
287:	learn: 0.4312618	total: 1.37s	remaining: 3.38s
288:	learn: 0.4305788	total: 1.37s	remaining: 3.37s
289:	learn: 0.4299323	total: 1.38s	remaining: 3.37s
290:	learn: 0.4294147	total: 1.38s	remaining: 3.37s
291:	learn: 0.4287311	total: 1.39s	remaining: 3.36s
292:	learn: 0.4281021	total: 1.39s	remaining: 3.35s
293:	learn: 0.4274824	total: 1.39s	remaining: 3.35s
294:	learn: 0.4266494	total: 1.4s	remaining: 3.34s
295:	learn: 0.4259133	total: 1.4s	remaining: 3.34s
296:	learn: 0.4254208	total: 1.41s	remaining: 3.33s
297:	learn: 0.4248581	total: 1.41s	remaining: 3.33s
298:	learn: 0.4244197	total: 1.42s	remaining: 3.33s
299:	learn: 0.4238834	total: 1.42s	remaining: 3.32s
300:	learn: 0.4235753	total: 1.43s	remaining: 3.31s
301:	learn: 0.4230278	total: 1.43s	remaining: 3.31s
302:	learn: 0.4223168	total: 1.44s	remaining: 3.3s
303:	learn: 0.4218202	total: 1.44s	remaining: 3.3s
304:	learn: 0.4213324	total: 1.44s	remaining: 3.29s
305:	learn: 0.4209713	total: 1.45s	remaining: 3.29s
306:	learn: 0.4206682	total: 1.45s	remaining: 3.28s
307:	learn: 0.4205730	total: 1.46s	remaining: 3.27s
308:	learn: 0.4200366	total: 1.46s	remaining: 3.27s
309:	learn: 0.4196622	total: 1.47s	remaining: 3.26s
310:	learn: 0.4188254	total: 1.47s	remaining: 3.25s
311:	learn: 0.4182222	total: 1.47s	remaining: 3.25s
312:	learn: 0.4181272	total: 1.48s	remaining: 3.24s
313:	learn: 0.4178863	total: 1.48s	remaining: 3.24s
314:	learn: 0.4171997	total: 1.49s	remaining: 3.23s
315:	learn: 0.4165719	total: 1.49s	remaining: 3.23s
316:	learn: 0.4158275	total: 1.49s	remaining: 3.22s
317:	learn: 0.4150223	total: 1.5s	remaining: 3.21s
318:	learn: 0.4144227	total: 1.5s	remaining: 3.21s
319:	learn: 0.4138124	total: 1.51s	remaining: 3.2s
320:	learn: 0.4135332	total: 1.51s	remaining: 3.2s
321:	learn: 0.4129018	total: 1.51s	remaining: 3.19s
322:	learn: 0.4123695	total: 1.52s	remaining: 3.18s
323:	learn: 0.4116624	total: 1.52s	remaining: 3.18s
324:	learn: 0.4108634	total: 1.53s	remaining: 3.17s
325:	learn: 0.4103299	total: 1.53s	remaining: 3.17s
326:	learn: 0.4096449	total: 1.54s	remaining: 3.16s
327:	learn: 0.4092467	total: 1.54s	remaining: 3.16s
328:	learn: 0.4090363	total: 1.54s	remaining: 3.15s
329:	learn: 0.4080857	total: 1.55s	remaining: 3.14s
330:	learn: 0.4077160	total: 1.55s	remaining: 3.14s
331:	learn: 0.4071226	total: 1.56s	remaining: 3.13s
332:	learn: 0.4067749	total: 1.56s	remaining: 3.13s
333:	learn: 0.4062413	total: 1.57s	remaining: 3.12s
334:	learn: 0.4055695	total: 1.57s	remaining: 3.12s
335:	learn: 0.4054942	total: 1.57s	remaining: 3.11s
336:	learn: 0.4047835	total: 1.58s	remaining: 3.11s
337:	learn: 0.4042196	total: 1.58s	remaining: 3.1s
338:	learn: 0.4038735	total: 1.59s	remaining: 3.1s
339:	learn: 0.4036386	total: 1.59s	remaining: 3.1s
340:	learn: 0.4033409	total: 1.6s	remaining: 3.09s
341:	learn: 0.4026643	total: 1.6s	remaining: 3.08s
342:	learn: 0.4023055	total: 1.61s	remaining: 3.08s
343:	learn: 0.4021658	total: 1.61s	remaining: 3.07s
344:	learn: 0.4018412	total: 1.62s	remaining: 3.07s
345:	learn: 0.4013429	total: 1.62s	remaining: 3.06s
346:	learn: 0.4008246	total: 1.63s	remaining: 3.06s
347:	learn: 0.4002585	total: 1.63s	remaining: 3.05s
348:	learn: 0.3998110	total: 1.63s	remaining: 3.05s
349:	learn: 0.3990401	total: 1.64s	remaining: 3.04s
350:	learn: 0.3982868	total: 1.64s	remaining: 3.04s
351:	learn: 0.3977941	total: 1.65s	remaining: 3.03s
352:	learn: 0.3971540	total: 1.65s	remaining: 3.03s
353:	learn: 0.3966626	total: 1.66s	remaining: 3.02s
354:	learn: 0.3965989	total: 1.66s	remaining: 3.01s
355:	learn: 0.3962189	total: 1.66s	remaining: 3.01s
356:	learn: 0.3958068	total: 1.67s	remaining: 3s
357:	learn: 0.3953095	total: 1.67s	remaining: 3s
358:	learn: 0.3948418	total: 1.68s	remaining: 2.99s
359:	learn: 0.3946123	total: 1.68s	remaining: 2.99s
360:	learn: 0.3938100	total: 1.69s	remaining: 2.98s
361:	learn: 0.3932202	total: 1.69s	remaining: 2.98s
362:	learn: 0.3925485	total: 1.69s	remaining: 2.97s
363:	learn: 0.3918346	total: 1.7s	remaining: 2.97s
364:	learn: 0.3915180	total: 1.7s	remaining: 2.96s
365:	learn: 0.3907133	total: 1.71s	remaining: 2.96s
366:	learn: 0.3905787	total: 1.71s	remaining: 2.95s
367:	learn: 0.3900100	total: 1.72s	remaining: 2.95s
368:	learn: 0.3894581	total: 1.72s	remaining: 2.94s
369:	learn: 0.3887745	total: 1.73s	remaining: 2.94s
370:	learn: 0.3881815	total: 1.73s	remaining: 2.93s
371:	learn: 0.3877839	total: 1.73s	remaining: 2.93s
372:	learn: 0.3872649	total: 1.74s	remaining: 2.92s
373:	learn: 0.3864811	total: 1.74s	remaining: 2.92s
374:	learn: 0.3859586	total: 1.75s	remaining: 2.91s
375:	learn: 0.3857552	total: 1.75s	remaining: 2.9s
376:	learn: 0.3851249	total: 1.75s	remaining: 2.9s
377:	learn: 0.3846241	total: 1.76s	remaining: 2.9s
378:	learn: 0.3842221	total: 1.76s	remaining: 2.89s
379:	learn: 0.3840501	total: 1.77s	remaining: 2.88s
380:	learn: 0.3839565	total: 1.77s	remaining: 2.88s
381:	learn: 0.3834811	total: 1.78s	remaining: 2.88s
382:	learn: 0.3829302	total: 1.78s	remaining: 2.87s
383:	learn: 0.3824233	total: 1.79s	remaining: 2.87s
384:	learn: 0.3820862	total: 1.79s	remaining: 2.86s
385:	learn: 0.3817059	total: 1.79s	remaining: 2.86s
386:	learn: 0.3810940	total: 1.8s	remaining: 2.85s
387:	learn: 0.3807112	total: 1.8s	remaining: 2.85s
388:	learn: 0.3802605	total: 1.81s	remaining: 2.84s
389:	learn: 0.3798039	total: 1.81s	remaining: 2.83s
390:	learn: 0.3793238	total: 1.82s	remaining: 2.83s
391:	learn: 0.3787842	total: 1.82s	remaining: 2.83s
392:	learn: 0.3783469	total: 1.82s	remaining: 2.82s
393:	learn: 0.3774757	total: 1.83s	remaining: 2.81s
394:	learn: 0.3770911	total: 1.83s	remaining: 2.81s
395:	learn: 0.3767255	total: 1.84s	remaining: 2.8s
396:	learn: 0.3761264	total: 1.84s	remaining: 2.8s
397:	learn: 0.3757592	total: 1.85s	remaining: 2.79s
398:	learn: 0.3751453	total: 1.85s	remaining: 2.79s
399:	learn: 0.3748033	total: 1.85s	remaining: 2.78s
400:	learn: 0.3743184	total: 1.86s	remaining: 2.78s
401:	learn: 0.3740077	total: 1.86s	remaining: 2.77s
402:	learn: 0.3733825	total: 1.87s	remaining: 2.77s
403:	learn: 0.3728581	total: 1.87s	remaining: 2.76s
404:	learn: 0.3722381	total: 1.88s	remaining: 2.76s
405:	learn: 0.3716600	total: 1.88s	remaining: 2.75s
406:	learn: 0.3709735	total: 1.89s	remaining: 2.75s
407:	learn: 0.3706292	total: 1.89s	remaining: 2.74s
408:	learn: 0.3700405	total: 1.89s	remaining: 2.74s
409:	learn: 0.3695724	total: 1.9s	remaining: 2.73s
410:	learn: 0.3688604	total: 1.9s	remaining: 2.73s
411:	learn: 0.3687772	total: 1.91s	remaining: 2.72s
412:	learn: 0.3680599	total: 1.91s	remaining: 2.72s
413:	learn: 0.3676867	total: 1.92s	remaining: 2.71s
414:	learn: 0.3673532	total: 1.92s	remaining: 2.71s
415:	learn: 0.3667493	total: 1.92s	remaining: 2.7s
416:	learn: 0.3663581	total: 1.93s	remaining: 2.69s
417:	learn: 0.3659273	total: 1.93s	remaining: 2.69s
418:	learn: 0.3653585	total: 1.94s	remaining: 2.69s
419:	learn: 0.3647104	total: 1.94s	remaining: 2.68s
420:	learn: 0.3643467	total: 1.94s	remaining: 2.67s
421:	learn: 0.3638706	total: 1.95s	remaining: 2.67s
422:	learn: 0.3637044	total: 1.95s	remaining: 2.66s
423:	learn: 0.3630532	total: 1.96s	remaining: 2.66s
424:	learn: 0.3628976	total: 1.96s	remaining: 2.65s
425:	learn: 0.3622257	total: 1.97s	remaining: 2.65s
426:	learn: 0.3618914	total: 1.97s	remaining: 2.65s
427:	learn: 0.3614434	total: 1.98s	remaining: 2.64s
428:	learn: 0.3610056	total: 1.98s	remaining: 2.64s
429:	learn: 0.3603069	total: 1.99s	remaining: 2.63s
430:	learn: 0.3599433	total: 1.99s	remaining: 2.63s
431:	learn: 0.3598531	total: 1.99s	remaining: 2.62s
432:	learn: 0.3597998	total: 2s	remaining: 2.62s
433:	learn: 0.3592574	total: 2s	remaining: 2.61s
434:	learn: 0.3586937	total: 2.01s	remaining: 2.6s
435:	learn: 0.3582985	total: 2.01s	remaining: 2.6s
436:	learn: 0.3579804	total: 2.01s	remaining: 2.6s
437:	learn: 0.3576664	total: 2.02s	remaining: 2.59s
438:	learn: 0.3572953	total: 2.02s	remaining: 2.58s
439:	learn: 0.3568969	total: 2.03s	remaining: 2.58s
440:	learn: 0.3562306	total: 2.03s	remaining: 2.58s
441:	learn: 0.3555624	total: 2.04s	remaining: 2.57s
442:	learn: 0.3550610	total: 2.04s	remaining: 2.56s
443:	learn: 0.3545855	total: 2.04s	remaining: 2.56s
444:	learn: 0.3541245	total: 2.04s	remaining: 2.55s
445:	learn: 0.3537684	total: 2.05s	remaining: 2.54s
446:	learn: 0.3534726	total: 2.05s	remaining: 2.54s
447:	learn: 0.3533520	total: 2.06s	remaining: 2.53s
448:	learn: 0.3530596	total: 2.06s	remaining: 2.53s
449:	learn: 0.3525138	total: 2.06s	remaining: 2.52s
450:	learn: 0.3520520	total: 2.07s	remaining: 2.52s
451:	learn: 0.3517294	total: 2.07s	remaining: 2.51s
452:	learn: 0.3516567	total: 2.08s	remaining: 2.5s
453:	learn: 0.3513043	total: 2.08s	remaining: 2.5s
454:	learn: 0.3509406	total: 2.08s	remaining: 2.49s
455:	learn: 0.3504919	total: 2.08s	remaining: 2.49s
456:	learn: 0.3498914	total: 2.09s	remaining: 2.48s
457:	learn: 0.3494417	total: 2.09s	remaining: 2.48s
458:	learn: 0.3490944	total: 2.1s	remaining: 2.47s
459:	learn: 0.3487102	total: 2.1s	remaining: 2.46s
460:	learn: 0.3484557	total: 2.1s	remaining: 2.46s
461:	learn: 0.3478789	total: 2.11s	remaining: 2.46s
462:	learn: 0.3474275	total: 2.11s	remaining: 2.45s
463:	learn: 0.3468342	total: 2.12s	remaining: 2.45s
464:	learn: 0.3463040	total: 2.12s	remaining: 2.44s
465:	learn: 0.3458213	total: 2.13s	remaining: 2.44s
466:	learn: 0.3451803	total: 2.13s	remaining: 2.43s
467:	learn: 0.3447292	total: 2.14s	remaining: 2.43s
468:	learn: 0.3443530	total: 2.14s	remaining: 2.42s
469:	learn: 0.3437850	total: 2.15s	remaining: 2.42s
470:	learn: 0.3433565	total: 2.15s	remaining: 2.41s
471:	learn: 0.3430048	total: 2.15s	remaining: 2.41s
472:	learn: 0.3425770	total: 2.16s	remaining: 2.4s
473:	learn: 0.3419536	total: 2.16s	remaining: 2.4s
474:	learn: 0.3414171	total: 2.17s	remaining: 2.4s
475:	learn: 0.3409048	total: 2.17s	remaining: 2.39s
476:	learn: 0.3406389	total: 2.18s	remaining: 2.39s
477:	learn: 0.3402231	total: 2.18s	remaining: 2.38s
478:	learn: 0.3397786	total: 2.19s	remaining: 2.38s
479:	learn: 0.3396398	total: 2.19s	remaining: 2.37s
480:	learn: 0.3395825	total: 2.19s	remaining: 2.37s
481:	learn: 0.3391371	total: 2.2s	remaining: 2.36s
482:	learn: 0.3386234	total: 2.2s	remaining: 2.36s
483:	learn: 0.3382257	total: 2.21s	remaining: 2.35s
484:	learn: 0.3378495	total: 2.21s	remaining: 2.35s
485:	learn: 0.3375195	total: 2.22s	remaining: 2.34s
486:	learn: 0.3371611	total: 2.22s	remaining: 2.34s
487:	learn: 0.3367422	total: 2.22s	remaining: 2.33s
488:	learn: 0.3363054	total: 2.23s	remaining: 2.33s
489:	learn: 0.3359786	total: 2.23s	remaining: 2.32s
490:	learn: 0.3356998	total: 2.24s	remaining: 2.32s
491:	learn: 0.3354453	total: 2.24s	remaining: 2.31s
492:	learn: 0.3352187	total: 2.25s	remaining: 2.31s
493:	learn: 0.3350124	total: 2.25s	remaining: 2.31s
494:	learn: 0.3346929	total: 2.25s	remaining: 2.3s
495:	learn: 0.3342032	total: 2.26s	remaining: 2.3s
496:	learn: 0.3336972	total: 2.26s	remaining: 2.29s
497:	learn: 0.3336357	total: 2.27s	remaining: 2.29s
498:	learn: 0.3333691	total: 2.27s	remaining: 2.28s
499:	learn: 0.3329063	total: 2.28s	remaining: 2.28s
500:	learn: 0.3326459	total: 2.28s	remaining: 2.27s
501:	learn: 0.3323511	total: 2.28s	remaining: 2.27s
502:	learn: 0.3319477	total: 2.29s	remaining: 2.26s
503:	learn: 0.3317364	total: 2.29s	remaining: 2.26s
504:	learn: 0.3312039	total: 2.3s	remaining: 2.25s
505:	learn: 0.3305307	total: 2.3s	remaining: 2.25s
506:	learn: 0.3300698	total: 2.31s	remaining: 2.24s
507:	learn: 0.3296915	total: 2.31s	remaining: 2.24s
508:	learn: 0.3294125	total: 2.31s	remaining: 2.23s
509:	learn: 0.3291011	total: 2.32s	remaining: 2.23s
510:	learn: 0.3289021	total: 2.32s	remaining: 2.22s
511:	learn: 0.3285137	total: 2.33s	remaining: 2.22s
512:	learn: 0.3278964	total: 2.33s	remaining: 2.21s
513:	learn: 0.3274555	total: 2.34s	remaining: 2.21s
514:	learn: 0.3270691	total: 2.34s	remaining: 2.2s
515:	learn: 0.3268052	total: 2.35s	remaining: 2.2s
516:	learn: 0.3263295	total: 2.35s	remaining: 2.19s
517:	learn: 0.3258524	total: 2.35s	remaining: 2.19s
518:	learn: 0.3255794	total: 2.36s	remaining: 2.19s
519:	learn: 0.3251552	total: 2.36s	remaining: 2.18s
520:	learn: 0.3247569	total: 2.37s	remaining: 2.18s
521:	learn: 0.3242420	total: 2.37s	remaining: 2.17s
522:	learn: 0.3240746	total: 2.38s	remaining: 2.17s
523:	learn: 0.3239854	total: 2.38s	remaining: 2.16s
524:	learn: 0.3239415	total: 2.38s	remaining: 2.16s
525:	learn: 0.3236346	total: 2.39s	remaining: 2.15s
526:	learn: 0.3233439	total: 2.39s	remaining: 2.15s
527:	learn: 0.3232261	total: 2.4s	remaining: 2.14s
528:	learn: 0.3226489	total: 2.4s	remaining: 2.14s
529:	learn: 0.3221208	total: 2.41s	remaining: 2.13s
530:	learn: 0.3217815	total: 2.41s	remaining: 2.13s
531:	learn: 0.3216486	total: 2.42s	remaining: 2.13s
532:	learn: 0.3213358	total: 2.42s	remaining: 2.12s
533:	learn: 0.3211304	total: 2.43s	remaining: 2.12s
534:	learn: 0.3210996	total: 2.43s	remaining: 2.11s
535:	learn: 0.3207515	total: 2.44s	remaining: 2.11s
536:	learn: 0.3204738	total: 2.44s	remaining: 2.1s
537:	learn: 0.3198882	total: 2.44s	remaining: 2.1s
538:	learn: 0.3195154	total: 2.45s	remaining: 2.09s
539:	learn: 0.3192952	total: 2.45s	remaining: 2.09s
540:	learn: 0.3190280	total: 2.46s	remaining: 2.08s
541:	learn: 0.3187129	total: 2.46s	remaining: 2.08s
542:	learn: 0.3181960	total: 2.46s	remaining: 2.07s
543:	learn: 0.3176606	total: 2.47s	remaining: 2.07s
544:	learn: 0.3170949	total: 2.47s	remaining: 2.06s
545:	learn: 0.3167459	total: 2.48s	remaining: 2.06s
546:	learn: 0.3163222	total: 2.48s	remaining: 2.06s
547:	learn: 0.3161420	total: 2.49s	remaining: 2.05s
548:	learn: 0.3157992	total: 2.49s	remaining: 2.05s
549:	learn: 0.3152920	total: 2.5s	remaining: 2.04s
550:	learn: 0.3148228	total: 2.5s	remaining: 2.04s
551:	learn: 0.3145167	total: 2.5s	remaining: 2.03s
552:	learn: 0.3139810	total: 2.51s	remaining: 2.03s
553:	learn: 0.3133923	total: 2.51s	remaining: 2.02s
554:	learn: 0.3129725	total: 2.52s	remaining: 2.02s
555:	learn: 0.3125777	total: 2.52s	remaining: 2.01s
556:	learn: 0.3123591	total: 2.53s	remaining: 2.01s
557:	learn: 0.3118976	total: 2.53s	remaining: 2s
558:	learn: 0.3117266	total: 2.54s	remaining: 2s
559:	learn: 0.3114944	total: 2.54s	remaining: 2s
560:	learn: 0.3112216	total: 2.54s	remaining: 1.99s
561:	learn: 0.3108621	total: 2.55s	remaining: 1.99s
562:	learn: 0.3104885	total: 2.55s	remaining: 1.98s
563:	learn: 0.3103672	total: 2.56s	remaining: 1.98s
564:	learn: 0.3099546	total: 2.56s	remaining: 1.97s
565:	learn: 0.3094861	total: 2.57s	remaining: 1.97s
566:	learn: 0.3091734	total: 2.57s	remaining: 1.96s
567:	learn: 0.3089148	total: 2.58s	remaining: 1.96s
568:	learn: 0.3086673	total: 2.58s	remaining: 1.95s
569:	learn: 0.3082695	total: 2.58s	remaining: 1.95s
570:	learn: 0.3078891	total: 2.59s	remaining: 1.94s
571:	learn: 0.3074927	total: 2.59s	remaining: 1.94s
572:	learn: 0.3069862	total: 2.6s	remaining: 1.94s
573:	learn: 0.3064039	total: 2.6s	remaining: 1.93s
574:	learn: 0.3061550	total: 2.6s	remaining: 1.93s
575:	learn: 0.3056908	total: 2.61s	remaining: 1.92s
576:	learn: 0.3052593	total: 2.61s	remaining: 1.92s
577:	learn: 0.3049534	total: 2.62s	remaining: 1.91s
578:	learn: 0.3044688	total: 2.62s	remaining: 1.91s
579:	learn: 0.3041772	total: 2.63s	remaining: 1.9s
580:	learn: 0.3037533	total: 2.63s	remaining: 1.9s
581:	learn: 0.3035700	total: 2.64s	remaining: 1.89s
582:	learn: 0.3031629	total: 2.64s	remaining: 1.89s
583:	learn: 0.3028838	total: 2.64s	remaining: 1.88s
584:	learn: 0.3023030	total: 2.65s	remaining: 1.88s
585:	learn: 0.3019222	total: 2.65s	remaining: 1.87s
586:	learn: 0.3015915	total: 2.66s	remaining: 1.87s
587:	learn: 0.3015704	total: 2.66s	remaining: 1.86s
588:	learn: 0.3009768	total: 2.67s	remaining: 1.86s
589:	learn: 0.3008180	total: 2.67s	remaining: 1.85s
590:	learn: 0.3005118	total: 2.67s	remaining: 1.85s
591:	learn: 0.3002814	total: 2.68s	remaining: 1.85s
592:	learn: 0.3000431	total: 2.68s	remaining: 1.84s
593:	learn: 0.2997323	total: 2.69s	remaining: 1.84s
594:	learn: 0.2994389	total: 2.69s	remaining: 1.83s
595:	learn: 0.2993389	total: 2.7s	remaining: 1.83s
596:	learn: 0.2988775	total: 2.7s	remaining: 1.82s
597:	learn: 0.2987598	total: 2.7s	remaining: 1.82s
598:	learn: 0.2984033	total: 2.71s	remaining: 1.81s
599:	learn: 0.2978245	total: 2.71s	remaining: 1.81s
600:	learn: 0.2974830	total: 2.72s	remaining: 1.8s
601:	learn: 0.2969405	total: 2.72s	remaining: 1.8s
602:	learn: 0.2966278	total: 2.73s	remaining: 1.79s
603:	learn: 0.2962533	total: 2.73s	remaining: 1.79s
604:	learn: 0.2959855	total: 2.73s	remaining: 1.78s
605:	learn: 0.2956097	total: 2.74s	remaining: 1.78s
606:	learn: 0.2953299	total: 2.75s	remaining: 1.78s
607:	learn: 0.2951260	total: 2.75s	remaining: 1.77s
608:	learn: 0.2947087	total: 2.75s	remaining: 1.77s
609:	learn: 0.2944823	total: 2.76s	remaining: 1.76s
610:	learn: 0.2943654	total: 2.76s	remaining: 1.76s
611:	learn: 0.2941610	total: 2.77s	remaining: 1.75s
612:	learn: 0.2938108	total: 2.77s	remaining: 1.75s
613:	learn: 0.2934358	total: 2.78s	remaining: 1.75s
614:	learn: 0.2931197	total: 2.78s	remaining: 1.74s
615:	learn: 0.2930426	total: 2.79s	remaining: 1.74s
616:	learn: 0.2928295	total: 2.79s	remaining: 1.73s
617:	learn: 0.2924921	total: 2.79s	remaining: 1.73s
618:	learn: 0.2921751	total: 2.8s	remaining: 1.72s
619:	learn: 0.2918991	total: 2.8s	remaining: 1.72s
620:	learn: 0.2913899	total: 2.81s	remaining: 1.71s
621:	learn: 0.2908501	total: 2.81s	remaining: 1.71s
622:	learn: 0.2905626	total: 2.82s	remaining: 1.7s
623:	learn: 0.2902904	total: 2.82s	remaining: 1.7s
624:	learn: 0.2900379	total: 2.83s	remaining: 1.7s
625:	learn: 0.2896903	total: 2.83s	remaining: 1.69s
626:	learn: 0.2892638	total: 2.83s	remaining: 1.69s
627:	learn: 0.2889578	total: 2.84s	remaining: 1.68s
628:	learn: 0.2887475	total: 2.84s	remaining: 1.68s
629:	learn: 0.2883429	total: 2.85s	remaining: 1.67s
630:	learn: 0.2881113	total: 2.85s	remaining: 1.67s
631:	learn: 0.2876183	total: 2.85s	remaining: 1.66s
632:	learn: 0.2875565	total: 2.86s	remaining: 1.66s
633:	learn: 0.2871677	total: 2.86s	remaining: 1.65s
634:	learn: 0.2868290	total: 2.87s	remaining: 1.65s
635:	learn: 0.2866803	total: 2.87s	remaining: 1.64s
636:	learn: 0.2864461	total: 2.88s	remaining: 1.64s
637:	learn: 0.2861437	total: 2.88s	remaining: 1.64s
638:	learn: 0.2858220	total: 2.89s	remaining: 1.63s
639:	learn: 0.2856065	total: 2.89s	remaining: 1.63s
640:	learn: 0.2853185	total: 2.89s	remaining: 1.62s
641:	learn: 0.2849693	total: 2.9s	remaining: 1.62s
642:	learn: 0.2844873	total: 2.9s	remaining: 1.61s
643:	learn: 0.2841484	total: 2.91s	remaining: 1.61s
644:	learn: 0.2838365	total: 2.91s	remaining: 1.6s
645:	learn: 0.2835952	total: 2.92s	remaining: 1.6s
646:	learn: 0.2831927	total: 2.92s	remaining: 1.59s
647:	learn: 0.2827722	total: 2.92s	remaining: 1.59s
648:	learn: 0.2824569	total: 2.93s	remaining: 1.58s
649:	learn: 0.2821450	total: 2.93s	remaining: 1.58s
650:	learn: 0.2820499	total: 2.94s	remaining: 1.57s
651:	learn: 0.2818476	total: 2.94s	remaining: 1.57s
652:	learn: 0.2815995	total: 2.95s	remaining: 1.57s
653:	learn: 0.2812118	total: 2.95s	remaining: 1.56s
654:	learn: 0.2809202	total: 2.96s	remaining: 1.56s
655:	learn: 0.2806815	total: 2.96s	remaining: 1.55s
656:	learn: 0.2802227	total: 2.96s	remaining: 1.55s
657:	learn: 0.2800624	total: 2.97s	remaining: 1.54s
658:	learn: 0.2795849	total: 2.97s	remaining: 1.54s
659:	learn: 0.2793078	total: 2.98s	remaining: 1.53s
660:	learn: 0.2791128	total: 2.98s	remaining: 1.53s
661:	learn: 0.2788399	total: 2.99s	remaining: 1.52s
662:	learn: 0.2788037	total: 2.99s	remaining: 1.52s
663:	learn: 0.2786668	total: 3s	remaining: 1.51s
664:	learn: 0.2784946	total: 3s	remaining: 1.51s
665:	learn: 0.2780830	total: 3s	remaining: 1.51s
666:	learn: 0.2778176	total: 3.01s	remaining: 1.5s
667:	learn: 0.2775362	total: 3.01s	remaining: 1.5s
668:	learn: 0.2771948	total: 3.02s	remaining: 1.49s
669:	learn: 0.2770151	total: 3.02s	remaining: 1.49s
670:	learn: 0.2766705	total: 3.03s	remaining: 1.48s
671:	learn: 0.2761992	total: 3.03s	remaining: 1.48s
672:	learn: 0.2761667	total: 3.04s	remaining: 1.48s
673:	learn: 0.2758894	total: 3.04s	remaining: 1.47s
674:	learn: 0.2754654	total: 3.04s	remaining: 1.47s
675:	learn: 0.2751694	total: 3.05s	remaining: 1.46s
676:	learn: 0.2747818	total: 3.05s	remaining: 1.46s
677:	learn: 0.2743284	total: 3.06s	remaining: 1.45s
678:	learn: 0.2738872	total: 3.06s	remaining: 1.45s
679:	learn: 0.2735984	total: 3.07s	remaining: 1.44s
680:	learn: 0.2732869	total: 3.07s	remaining: 1.44s
681:	learn: 0.2730972	total: 3.07s	remaining: 1.43s
682:	learn: 0.2728736	total: 3.08s	remaining: 1.43s
683:	learn: 0.2726755	total: 3.08s	remaining: 1.42s
684:	learn: 0.2723927	total: 3.09s	remaining: 1.42s
685:	learn: 0.2719884	total: 3.09s	remaining: 1.42s
686:	learn: 0.2715762	total: 3.1s	remaining: 1.41s
687:	learn: 0.2713579	total: 3.1s	remaining: 1.41s
688:	learn: 0.2711750	total: 3.1s	remaining: 1.4s
689:	learn: 0.2709947	total: 3.11s	remaining: 1.4s
690:	learn: 0.2707097	total: 3.11s	remaining: 1.39s
691:	learn: 0.2704260	total: 3.12s	remaining: 1.39s
692:	learn: 0.2701645	total: 3.12s	remaining: 1.38s
693:	learn: 0.2698393	total: 3.13s	remaining: 1.38s
694:	learn: 0.2695105	total: 3.13s	remaining: 1.37s
695:	learn: 0.2691283	total: 3.14s	remaining: 1.37s
696:	learn: 0.2688368	total: 3.14s	remaining: 1.36s
697:	learn: 0.2683466	total: 3.15s	remaining: 1.36s
698:	learn: 0.2680603	total: 3.15s	remaining: 1.36s
699:	learn: 0.2677699	total: 3.15s	remaining: 1.35s
700:	learn: 0.2674562	total: 3.16s	remaining: 1.35s
701:	learn: 0.2671464	total: 3.16s	remaining: 1.34s
702:	learn: 0.2668508	total: 3.17s	remaining: 1.34s
703:	learn: 0.2666627	total: 3.17s	remaining: 1.33s
704:	learn: 0.2663736	total: 3.18s	remaining: 1.33s
705:	learn: 0.2660888	total: 3.18s	remaining: 1.32s
706:	learn: 0.2658267	total: 3.18s	remaining: 1.32s
707:	learn: 0.2656145	total: 3.19s	remaining: 1.31s
708:	learn: 0.2654194	total: 3.19s	remaining: 1.31s
709:	learn: 0.2651461	total: 3.2s	remaining: 1.3s
710:	learn: 0.2647736	total: 3.2s	remaining: 1.3s
711:	learn: 0.2644030	total: 3.21s	remaining: 1.3s
712:	learn: 0.2641493	total: 3.21s	remaining: 1.29s
713:	learn: 0.2636034	total: 3.21s	remaining: 1.29s
714:	learn: 0.2633467	total: 3.22s	remaining: 1.28s
715:	learn: 0.2630884	total: 3.22s	remaining: 1.28s
716:	learn: 0.2627231	total: 3.23s	remaining: 1.27s
717:	learn: 0.2623737	total: 3.23s	remaining: 1.27s
718:	learn: 0.2620890	total: 3.23s	remaining: 1.26s
719:	learn: 0.2619557	total: 3.24s	remaining: 1.26s
720:	learn: 0.2615330	total: 3.25s	remaining: 1.25s
721:	learn: 0.2613678	total: 3.25s	remaining: 1.25s
722:	learn: 0.2609768	total: 3.25s	remaining: 1.25s
723:	learn: 0.2607843	total: 3.26s	remaining: 1.24s
724:	learn: 0.2604017	total: 3.26s	remaining: 1.24s
725:	learn: 0.2601656	total: 3.27s	remaining: 1.23s
726:	learn: 0.2598170	total: 3.27s	remaining: 1.23s
727:	learn: 0.2596272	total: 3.27s	remaining: 1.22s
728:	learn: 0.2594368	total: 3.28s	remaining: 1.22s
729:	learn: 0.2591430	total: 3.28s	remaining: 1.21s
730:	learn: 0.2588290	total: 3.29s	remaining: 1.21s
731:	learn: 0.2585789	total: 3.29s	remaining: 1.21s
732:	learn: 0.2584455	total: 3.3s	remaining: 1.2s
733:	learn: 0.2584104	total: 3.3s	remaining: 1.2s
734:	learn: 0.2582152	total: 3.3s	remaining: 1.19s
735:	learn: 0.2581397	total: 3.31s	remaining: 1.19s
736:	learn: 0.2578583	total: 3.31s	remaining: 1.18s
737:	learn: 0.2576197	total: 3.32s	remaining: 1.18s
738:	learn: 0.2573741	total: 3.32s	remaining: 1.17s
739:	learn: 0.2571082	total: 3.33s	remaining: 1.17s
740:	learn: 0.2568704	total: 3.33s	remaining: 1.16s
741:	learn: 0.2566573	total: 3.33s	remaining: 1.16s
742:	learn: 0.2564107	total: 3.34s	remaining: 1.16s
743:	learn: 0.2561631	total: 3.35s	remaining: 1.15s
744:	learn: 0.2561312	total: 3.35s	remaining: 1.15s
745:	learn: 0.2557099	total: 3.36s	remaining: 1.14s
746:	learn: 0.2554609	total: 3.36s	remaining: 1.14s
747:	learn: 0.2552453	total: 3.36s	remaining: 1.13s
748:	learn: 0.2549054	total: 3.37s	remaining: 1.13s
749:	learn: 0.2546371	total: 3.37s	remaining: 1.12s
750:	learn: 0.2543651	total: 3.38s	remaining: 1.12s
751:	learn: 0.2537761	total: 3.38s	remaining: 1.11s
752:	learn: 0.2535814	total: 3.39s	remaining: 1.11s
753:	learn: 0.2531956	total: 3.39s	remaining: 1.11s
754:	learn: 0.2531389	total: 3.39s	remaining: 1.1s
755:	learn: 0.2529298	total: 3.4s	remaining: 1.1s
756:	learn: 0.2527376	total: 3.4s	remaining: 1.09s
757:	learn: 0.2524734	total: 3.41s	remaining: 1.09s
758:	learn: 0.2521595	total: 3.41s	remaining: 1.08s
759:	learn: 0.2519621	total: 3.42s	remaining: 1.08s
760:	learn: 0.2516846	total: 3.42s	remaining: 1.07s
761:	learn: 0.2512891	total: 3.42s	remaining: 1.07s
762:	learn: 0.2510728	total: 3.43s	remaining: 1.06s
763:	learn: 0.2507176	total: 3.43s	remaining: 1.06s
764:	learn: 0.2503362	total: 3.44s	remaining: 1.05s
765:	learn: 0.2502037	total: 3.44s	remaining: 1.05s
766:	learn: 0.2498936	total: 3.45s	remaining: 1.05s
767:	learn: 0.2496633	total: 3.45s	remaining: 1.04s
768:	learn: 0.2493097	total: 3.46s	remaining: 1.04s
769:	learn: 0.2489150	total: 3.46s	remaining: 1.03s
770:	learn: 0.2486851	total: 3.46s	remaining: 1.03s
771:	learn: 0.2483194	total: 3.47s	remaining: 1.02s
772:	learn: 0.2479421	total: 3.47s	remaining: 1.02s
773:	learn: 0.2478561	total: 3.48s	remaining: 1.01s
774:	learn: 0.2475209	total: 3.48s	remaining: 1.01s
775:	learn: 0.2471628	total: 3.48s	remaining: 1.01s
776:	learn: 0.2469314	total: 3.49s	remaining: 1s
777:	learn: 0.2466425	total: 3.49s	remaining: 997ms
778:	learn: 0.2465065	total: 3.5s	remaining: 993ms
779:	learn: 0.2461220	total: 3.5s	remaining: 988ms
780:	learn: 0.2458115	total: 3.51s	remaining: 983ms
781:	learn: 0.2456707	total: 3.51s	remaining: 979ms
782:	learn: 0.2454929	total: 3.52s	remaining: 974ms
783:	learn: 0.2451706	total: 3.52s	remaining: 970ms
784:	learn: 0.2448523	total: 3.52s	remaining: 965ms
785:	learn: 0.2445772	total: 3.53s	remaining: 961ms
786:	learn: 0.2443875	total: 3.53s	remaining: 956ms
787:	learn: 0.2441339	total: 3.54s	remaining: 952ms
788:	learn: 0.2438773	total: 3.54s	remaining: 948ms
789:	learn: 0.2438140	total: 3.55s	remaining: 943ms
790:	learn: 0.2434810	total: 3.55s	remaining: 939ms
791:	learn: 0.2429906	total: 3.56s	remaining: 934ms
792:	learn: 0.2426931	total: 3.56s	remaining: 930ms
793:	learn: 0.2424043	total: 3.56s	remaining: 925ms
794:	learn: 0.2421210	total: 3.57s	remaining: 920ms
795:	learn: 0.2419921	total: 3.57s	remaining: 916ms
796:	learn: 0.2418543	total: 3.58s	remaining: 911ms
797:	learn: 0.2418231	total: 3.58s	remaining: 907ms
798:	learn: 0.2415865	total: 3.59s	remaining: 902ms
799:	learn: 0.2412327	total: 3.59s	remaining: 898ms
800:	learn: 0.2410130	total: 3.6s	remaining: 893ms
801:	learn: 0.2406898	total: 3.6s	remaining: 889ms
802:	learn: 0.2403997	total: 3.6s	remaining: 884ms
803:	learn: 0.2403092	total: 3.61s	remaining: 880ms
804:	learn: 0.2399602	total: 3.61s	remaining: 875ms
805:	learn: 0.2397046	total: 3.62s	remaining: 870ms
806:	learn: 0.2395811	total: 3.62s	remaining: 866ms
807:	learn: 0.2393554	total: 3.62s	remaining: 861ms
808:	learn: 0.2389797	total: 3.63s	remaining: 857ms
809:	learn: 0.2386822	total: 3.63s	remaining: 852ms
810:	learn: 0.2385272	total: 3.64s	remaining: 848ms
811:	learn: 0.2383253	total: 3.64s	remaining: 843ms
812:	learn: 0.2382769	total: 3.65s	remaining: 839ms
813:	learn: 0.2380034	total: 3.65s	remaining: 834ms
814:	learn: 0.2378123	total: 3.65s	remaining: 830ms
815:	learn: 0.2376812	total: 3.66s	remaining: 825ms
816:	learn: 0.2375900	total: 3.66s	remaining: 821ms
817:	learn: 0.2373746	total: 3.67s	remaining: 817ms
818:	learn: 0.2368950	total: 3.67s	remaining: 812ms
819:	learn: 0.2365626	total: 3.68s	remaining: 808ms
820:	learn: 0.2364160	total: 3.68s	remaining: 803ms
821:	learn: 0.2362066	total: 3.69s	remaining: 798ms
822:	learn: 0.2357433	total: 3.69s	remaining: 794ms
823:	learn: 0.2355107	total: 3.69s	remaining: 789ms
824:	learn: 0.2353031	total: 3.7s	remaining: 785ms
825:	learn: 0.2349189	total: 3.7s	remaining: 780ms
826:	learn: 0.2345722	total: 3.71s	remaining: 776ms
827:	learn: 0.2342914	total: 3.71s	remaining: 771ms
828:	learn: 0.2340122	total: 3.72s	remaining: 767ms
829:	learn: 0.2337871	total: 3.72s	remaining: 762ms
830:	learn: 0.2337141	total: 3.73s	remaining: 758ms
831:	learn: 0.2334017	total: 3.73s	remaining: 754ms
832:	learn: 0.2331498	total: 3.74s	remaining: 749ms
833:	learn: 0.2329182	total: 3.74s	remaining: 745ms
834:	learn: 0.2326496	total: 3.75s	remaining: 740ms
835:	learn: 0.2323721	total: 3.75s	remaining: 736ms
836:	learn: 0.2321663	total: 3.75s	remaining: 731ms
837:	learn: 0.2317915	total: 3.76s	remaining: 727ms
838:	learn: 0.2317042	total: 3.76s	remaining: 722ms
839:	learn: 0.2315415	total: 3.77s	remaining: 718ms
840:	learn: 0.2312144	total: 3.77s	remaining: 713ms
841:	learn: 0.2309861	total: 3.78s	remaining: 709ms
842:	learn: 0.2308992	total: 3.78s	remaining: 704ms
843:	learn: 0.2307248	total: 3.78s	remaining: 699ms
844:	learn: 0.2304123	total: 3.79s	remaining: 695ms
845:	learn: 0.2302900	total: 3.79s	remaining: 690ms
846:	learn: 0.2299298	total: 3.8s	remaining: 686ms
847:	learn: 0.2297649	total: 3.8s	remaining: 681ms
848:	learn: 0.2296314	total: 3.81s	remaining: 677ms
849:	learn: 0.2292939	total: 3.81s	remaining: 672ms
850:	learn: 0.2291602	total: 3.81s	remaining: 668ms
851:	learn: 0.2288865	total: 3.82s	remaining: 663ms
852:	learn: 0.2286853	total: 3.82s	remaining: 659ms
853:	learn: 0.2285428	total: 3.83s	remaining: 654ms
854:	learn: 0.2284197	total: 3.83s	remaining: 650ms
855:	learn: 0.2282277	total: 3.83s	remaining: 645ms
856:	learn: 0.2278907	total: 3.84s	remaining: 641ms
857:	learn: 0.2277383	total: 3.84s	remaining: 636ms
858:	learn: 0.2275067	total: 3.85s	remaining: 632ms
859:	learn: 0.2272363	total: 3.85s	remaining: 627ms
860:	learn: 0.2269635	total: 3.86s	remaining: 623ms
861:	learn: 0.2265979	total: 3.86s	remaining: 618ms
862:	learn: 0.2264905	total: 3.87s	remaining: 614ms
863:	learn: 0.2262386	total: 3.87s	remaining: 609ms
864:	learn: 0.2260311	total: 3.87s	remaining: 605ms
865:	learn: 0.2258230	total: 3.88s	remaining: 600ms
866:	learn: 0.2255610	total: 3.88s	remaining: 596ms
867:	learn: 0.2254607	total: 3.89s	remaining: 591ms
868:	learn: 0.2250433	total: 3.89s	remaining: 587ms
869:	learn: 0.2249648	total: 3.9s	remaining: 582ms
870:	learn: 0.2247922	total: 3.9s	remaining: 578ms
871:	learn: 0.2244233	total: 3.9s	remaining: 573ms
872:	learn: 0.2242406	total: 3.91s	remaining: 569ms
873:	learn: 0.2240229	total: 3.91s	remaining: 564ms
874:	learn: 0.2238258	total: 3.92s	remaining: 560ms
875:	learn: 0.2237571	total: 3.92s	remaining: 555ms
876:	learn: 0.2235548	total: 3.93s	remaining: 551ms
877:	learn: 0.2234135	total: 3.93s	remaining: 546ms
878:	learn: 0.2230922	total: 3.94s	remaining: 542ms
879:	learn: 0.2226509	total: 3.94s	remaining: 537ms
880:	learn: 0.2224404	total: 3.94s	remaining: 533ms
881:	learn: 0.2223705	total: 3.95s	remaining: 528ms
882:	learn: 0.2223260	total: 3.95s	remaining: 524ms
883:	learn: 0.2220318	total: 3.96s	remaining: 519ms
884:	learn: 0.2218552	total: 3.96s	remaining: 515ms
885:	learn: 0.2217687	total: 3.97s	remaining: 510ms
886:	learn: 0.2214940	total: 3.97s	remaining: 506ms
887:	learn: 0.2213258	total: 3.98s	remaining: 501ms
888:	learn: 0.2212884	total: 3.98s	remaining: 497ms
889:	learn: 0.2210636	total: 3.98s	remaining: 493ms
890:	learn: 0.2208620	total: 3.99s	remaining: 488ms
891:	learn: 0.2207268	total: 4s	remaining: 484ms
892:	learn: 0.2205061	total: 4s	remaining: 479ms
893:	learn: 0.2203278	total: 4s	remaining: 475ms
894:	learn: 0.2200449	total: 4.01s	remaining: 470ms
895:	learn: 0.2197147	total: 4.01s	remaining: 466ms
896:	learn: 0.2195113	total: 4.02s	remaining: 461ms
897:	learn: 0.2192671	total: 4.02s	remaining: 457ms
898:	learn: 0.2192258	total: 4.03s	remaining: 452ms
899:	learn: 0.2189515	total: 4.03s	remaining: 448ms
900:	learn: 0.2186505	total: 4.03s	remaining: 443ms
901:	learn: 0.2184193	total: 4.04s	remaining: 439ms
902:	learn: 0.2183672	total: 4.04s	remaining: 434ms
903:	learn: 0.2181273	total: 4.05s	remaining: 430ms
904:	learn: 0.2178674	total: 4.05s	remaining: 425ms
905:	learn: 0.2176005	total: 4.05s	remaining: 421ms
906:	learn: 0.2174875	total: 4.06s	remaining: 416ms
907:	learn: 0.2171777	total: 4.06s	remaining: 412ms
908:	learn: 0.2169544	total: 4.07s	remaining: 407ms
909:	learn: 0.2167341	total: 4.07s	remaining: 403ms
910:	learn: 0.2166133	total: 4.08s	remaining: 398ms
911:	learn: 0.2164535	total: 4.08s	remaining: 394ms
912:	learn: 0.2164128	total: 4.08s	remaining: 389ms
913:	learn: 0.2162686	total: 4.09s	remaining: 385ms
914:	learn: 0.2160413	total: 4.09s	remaining: 380ms
915:	learn: 0.2158222	total: 4.1s	remaining: 376ms
916:	learn: 0.2157209	total: 4.1s	remaining: 371ms
917:	learn: 0.2155155	total: 4.11s	remaining: 367ms
918:	learn: 0.2154142	total: 4.11s	remaining: 362ms
919:	learn: 0.2151079	total: 4.12s	remaining: 358ms
920:	learn: 0.2149705	total: 4.12s	remaining: 353ms
921:	learn: 0.2149322	total: 4.12s	remaining: 349ms
922:	learn: 0.2148807	total: 4.13s	remaining: 344ms
923:	learn: 0.2146427	total: 4.13s	remaining: 340ms
924:	learn: 0.2145465	total: 4.14s	remaining: 335ms
925:	learn: 0.2143896	total: 4.14s	remaining: 331ms
926:	learn: 0.2141042	total: 4.15s	remaining: 327ms
927:	learn: 0.2140653	total: 4.15s	remaining: 322ms
928:	learn: 0.2138073	total: 4.16s	remaining: 318ms
929:	learn: 0.2135790	total: 4.16s	remaining: 313ms
930:	learn: 0.2135487	total: 4.16s	remaining: 309ms
931:	learn: 0.2134147	total: 4.17s	remaining: 304ms
932:	learn: 0.2132543	total: 4.17s	remaining: 300ms
933:	learn: 0.2131319	total: 4.18s	remaining: 295ms
934:	learn: 0.2128832	total: 4.18s	remaining: 291ms
935:	learn: 0.2125868	total: 4.18s	remaining: 286ms
936:	learn: 0.2124035	total: 4.19s	remaining: 282ms
937:	learn: 0.2121759	total: 4.19s	remaining: 277ms
938:	learn: 0.2120445	total: 4.2s	remaining: 273ms
939:	learn: 0.2118077	total: 4.2s	remaining: 268ms
940:	learn: 0.2114668	total: 4.21s	remaining: 264ms
941:	learn: 0.2111545	total: 4.21s	remaining: 259ms
942:	learn: 0.2111246	total: 4.21s	remaining: 255ms
943:	learn: 0.2109490	total: 4.22s	remaining: 250ms
944:	learn: 0.2107927	total: 4.22s	remaining: 246ms
945:	learn: 0.2105585	total: 4.23s	remaining: 241ms
946:	learn: 0.2105239	total: 4.23s	remaining: 237ms
947:	learn: 0.2104112	total: 4.24s	remaining: 232ms
948:	learn: 0.2101941	total: 4.24s	remaining: 228ms
949:	learn: 0.2101563	total: 4.24s	remaining: 223ms
950:	learn: 0.2101322	total: 4.25s	remaining: 219ms
951:	learn: 0.2098230	total: 4.25s	remaining: 214ms
952:	learn: 0.2097301	total: 4.26s	remaining: 210ms
953:	learn: 0.2095164	total: 4.26s	remaining: 206ms
954:	learn: 0.2093993	total: 4.27s	remaining: 201ms
955:	learn: 0.2091224	total: 4.27s	remaining: 197ms
956:	learn: 0.2089994	total: 4.28s	remaining: 192ms
957:	learn: 0.2088083	total: 4.28s	remaining: 188ms
958:	learn: 0.2087013	total: 4.29s	remaining: 183ms
959:	learn: 0.2084003	total: 4.29s	remaining: 179ms
960:	learn: 0.2082851	total: 4.29s	remaining: 174ms
961:	learn: 0.2080820	total: 4.3s	remaining: 170ms
962:	learn: 0.2078811	total: 4.3s	remaining: 165ms
963:	learn: 0.2076056	total: 4.31s	remaining: 161ms
964:	learn: 0.2074108	total: 4.31s	remaining: 156ms
965:	learn: 0.2070406	total: 4.32s	remaining: 152ms
966:	learn: 0.2067699	total: 4.32s	remaining: 148ms
967:	learn: 0.2063896	total: 4.33s	remaining: 143ms
968:	learn: 0.2060907	total: 4.33s	remaining: 139ms
969:	learn: 0.2059021	total: 4.33s	remaining: 134ms
970:	learn: 0.2056345	total: 4.34s	remaining: 130ms
971:	learn: 0.2053408	total: 4.34s	remaining: 125ms
972:	learn: 0.2050676	total: 4.35s	remaining: 121ms
973:	learn: 0.2048304	total: 4.35s	remaining: 116ms
974:	learn: 0.2046749	total: 4.36s	remaining: 112ms
975:	learn: 0.2044718	total: 4.36s	remaining: 107ms
976:	learn: 0.2041876	total: 4.37s	remaining: 103ms
977:	learn: 0.2039090	total: 4.37s	remaining: 98.3ms
978:	learn: 0.2037805	total: 4.38s	remaining: 93.9ms
979:	learn: 0.2034705	total: 4.38s	remaining: 89.4ms
980:	learn: 0.2032025	total: 4.38s	remaining: 84.9ms
981:	learn: 0.2030209	total: 4.39s	remaining: 80.4ms
982:	learn: 0.2027762	total: 4.39s	remaining: 76ms
983:	learn: 0.2026031	total: 4.4s	remaining: 71.5ms
984:	learn: 0.2023929	total: 4.4s	remaining: 67ms
985:	learn: 0.2021011	total: 4.41s	remaining: 62.6ms
986:	learn: 0.2018960	total: 4.41s	remaining: 58.1ms
987:	learn: 0.2017884	total: 4.41s	remaining: 53.6ms
988:	learn: 0.2014352	total: 4.42s	remaining: 49.1ms
989:	learn: 0.2012731	total: 4.42s	remaining: 44.7ms
990:	learn: 0.2010667	total: 4.43s	remaining: 40.2ms
991:	learn: 0.2008429	total: 4.43s	remaining: 35.7ms
992:	learn: 0.2006669	total: 4.43s	remaining: 31.3ms
993:	learn: 0.2005137	total: 4.44s	remaining: 26.8ms
994:	learn: 0.2001325	total: 4.44s	remaining: 22.3ms
995:	learn: 0.1999855	total: 4.45s	remaining: 17.9ms
996:	learn: 0.1996729	total: 4.45s	remaining: 13.4ms
997:	learn: 0.1994904	total: 4.46s	remaining: 8.93ms
998:	learn: 0.1993044	total: 4.46s	remaining: 4.47ms
999:	learn: 0.1989547	total: 4.47s	remaining: 0us
In [28]:
df_compare
Out[28]:
Model Mean Squared Error Mean Absolute Error R2 Score Training Time (s)
0 Ridge Regression 0.746614 0.598242 0.98175 0.002745
1 Linear Regression 0.77558 0.613905 0.981042 0.015371
2 Lasso Regression 1.066347 0.782536 0.973934 0.104589
3 Elastic Net 1.070319 0.718474 0.973837 0.108071
4 Gradient Boosting 87.489334 7.501632 -1.138585 2.444588
5 AdaBoost 88.43475 7.561282 -1.161695 0.786036
6 Random Forest 90.124875 7.670387 -1.203008 4.302596
7 Decision Tree 91.634349 7.746372 -1.239906 0.079427
11 CatBoost 93.180853 7.853729 -1.277709 4.704044
8 XGBoost 94.682514 7.893578 -1.314415 0.665853
9 K-Neighbors Regressor 330.959365 16.750109 -7.089956 0.000767
10 SVR 349.552891 17.672316 -7.544455 0.150409

We trained a variety of regression models to predict stock prices, including Linear Regression, Ridge Regression, Lasso Regression, Elastic Net, Support Vector Regression (SVR), K-Neighbors Regressor, Decision Tree, Random Forest, Gradient Boosting, AdaBoost, XGBoost, and CatBoost. The training results show a variety of metrics for different regression models, which are useful in evaluating their performance. Let's break down what each metric means and its significance:

  • Mean Squared Error (MSE): MSE is a measure of the average squared difference between the actual and predicted values. The lower the MSE, the better the model's performance. A high MSE, as seen in models like SVR, Decision Tree, and Random Forest, indicates poor model performance.
  • Mean Absolute Error (MAE): MAE measures the average magnitude of errors between predicted and actual values, without considering their direction (i.e., no squaring or rooting). Similar to MSE, a lower MAE is better. High MAE values in models like SVR suggest significant average errors in predictions.
  • R2 Score: The R2 Score, or the coefficient of determination, measures how well the regression predictions approximate the real data points. An R2 Score of 1 indicates perfect prediction. Positive values close to 1 indicate good model performance. The Linear and Ridge Regression models have high R2 scores, implying they fit the data well. Negative R2 scores, as seen in SVR, K-Neighbors Regressor, Decision Tree, Random Forest, Gradient Boosting, and AdaBoost, indicate that these models perform worse than a simple model that would always predict the mean value of the target variable.
  • Training Time (s): This measures how long it takes for each model to be trained. Shorter training times are generally preferred, especially when dealing with large datasets or in scenarios where model training needs to be performed frequently. The Random Forest model, for example, has a significantly longer training time compared to others, which might be a drawback in time-sensitive applications.
  • Analysis of Results: Linear Regression and Ridge Regression show the best performance in terms of MSE, MAE, and R2 Score. They are also efficient with relatively low training times. Lasso Regression performs moderately well, with a reasonable R2 score but much higher MSE and MAE than Linear and Ridge Regression. Models like SVR, K-Neighbors Regressor, Decision Tree, Random Forest, Gradient Boosting, and AdaBoost have very poor R2 scores, indicating that they are not suitable for this particular dataset or require parameter tuning. The Random Forest model, despite its popularity for handling complex datasets, shows poor performance and extremely high training time in this case, which might be due to overfitting, the need for hyperparameter tuning, or the nature of the dataset.
  • Conclusion: The choice of the best model depends on a balance of these metrics and the specific requirements of the application (e.g., prediction accuracy vs. training time). For this dataset, Linear Regression and Ridge Regression seem to be the most effective. However, for practical deployment, one should also consider factors like model complexity, interpretability, and how the model will generalize to unseen data.

3. Linear Regression Model¶

3.a Linear Regression Model with All Features¶

In [29]:
# Train the linear regression model
lr_model_base = LinearRegression()
lr_model_base.fit(X_train_scaled, y_train)

# Make predictions on the scaled test set
lr_pred_base = lr_model_base.predict(X_test_scaled)
In [30]:
prediction_df = pd.DataFrame()
prediction_df['date'] = df[df.date.dt.year>=2020]['date']
prediction_df['y_test'] = y_test
prediction_df['lr_pred_base'] = lr_pred_base

prediction_df.head()
Out[30]:
date y_test lr_pred_base
1729 2020-01-02 54.240002 54.157799
1730 2020-01-03 54.150002 54.553547
1731 2020-01-06 53.919998 54.336899
1732 2020-01-07 54.049999 53.907121
1733 2020-01-08 54.189999 54.192608
In [31]:
lr_score_base = evaluate_regression_model(y_test, lr_pred_base)
Mean Squared Error (MSE): 0.776
Root Mean Squared Error (RMSE): 0.881
Mean Absolute Error (MAE): 0.614
R-squared (R2): 0.981
In [32]:
lr_score_base
Out[32]:
{'MSE': 0.7755799343709945,
 'RMSE': 0.880670162076015,
 'MAE': 0.6139047642970673,
 'R2': 0.9810417591179742}
In [33]:
plot_regression_accuracy(y_test, lr_pred_base)
In [34]:
plot_predictions(df,lr_pred_base)
In [35]:
lr_base_feature_importance = plot_feature_importance(lr_model_base,X_train,20)
In [36]:
lr_base_feature_importance[:15]
Out[36]:
Feature Importance
0 adj close_10d_avg 59.927343
1 close_10d_avg 52.842967
2 close_15d_avg 32.217386
3 ema_9 31.326586
4 adj close_15d_avg 29.982687
5 close_5d_avg 17.929065
6 sma_5 13.495880
7 adj close_5d_avg 12.461717
8 adj close_3d_avg 10.395039
9 close_1d_ago 9.861051
10 adj close_7d_ago 9.733592
11 adj close_1d_ago 9.044414
12 close_7d_ago 7.605479
13 sma_15 6.911237
14 adj close_14d_ago 5.986732

3.b. Linear Regression Model with top 20 Features¶

In [37]:
keep_cols20 = lr_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)
In [38]:
# Train the linear regression model
lr_model20 = LinearRegression()
lr_model20.fit(X_train_scaled20, y_train)

# Make predictions on the scaled test set
lr_pred20 = lr_model20.predict(X_test_scaled20)
lr_score20 = evaluate_regression_model(y_test, lr_pred20)
Mean Squared Error (MSE): 0.768
Root Mean Squared Error (RMSE): 0.877
Mean Absolute Error (MAE): 0.613
R-squared (R2): 0.981
In [39]:
prediction_df['lr_pred20'] = lr_pred20

prediction_df.head()
Out[39]:
date y_test lr_pred_base lr_pred20
1729 2020-01-02 54.240002 54.157799 54.239022
1730 2020-01-03 54.150002 54.553547 54.520826
1731 2020-01-06 53.919998 54.336899 54.065422
1732 2020-01-07 54.049999 53.907121 54.067429
1733 2020-01-08 54.189999 54.192608 53.987340
In [40]:
lr_score20
Out[40]:
{'MSE': 0.7682885573217368,
 'RMSE': 0.8765207112908039,
 'MAE': 0.6133462584657988,
 'R2': 0.9812199892092073}
In [41]:
plot_feature_importance(lr_model20,X_train20,20)
Out[41]:
Feature Importance
0 adj close_10d_avg 39.701594
1 close_10d_avg 34.492844
2 ema_9 16.380613
3 adj close_7d_avg 14.543602
4 sma_5 11.032622
5 close_15d_avg 10.687774
6 sma_15 9.954755
7 adj close_5d_avg 8.830203
8 adj close_7d_ago 8.207036
9 adj close_15d_avg 7.303476
10 close_7d_ago 7.068989
11 adj close_1d_ago 6.587839
12 close_5d_avg 6.427914
13 close_1d_ago 4.837201
14 adj close_3d_avg 4.708264
15 close_3d_ago 4.101885
16 adj close_3d_ago 3.394063
17 low_5d_avg 1.134743
18 low_10d_avg 0.507845
19 adj close_14d_ago 0.220884

3.c Linear Regression Model with top 15 Features¶

In [42]:
keep_cols15 = lr_base_feature_importance[:15]['Feature'].tolist()

X_train15 = X_train[keep_cols15]
X_test15 = X_test[keep_cols15]

scaler = StandardScaler()
X_train_scaled15 = scaler.fit_transform(X_train15)
X_test_scaled15 = scaler.transform(X_test15)
In [43]:
# Train the linear regression model
lr_model15 = LinearRegression()
lr_model15.fit(X_train_scaled15, y_train)

# Make predictions on the scaled test set
lr_pred15 = lr_model15.predict(X_test_scaled15)
lr_score15 = evaluate_regression_model(y_test, lr_pred15)
Mean Squared Error (MSE): 0.797
Root Mean Squared Error (RMSE): 0.893
Mean Absolute Error (MAE): 0.626
R-squared (R2): 0.981
In [44]:
prediction_df['lr_pred15'] = lr_pred15

prediction_df.head()
Out[44]:
date y_test lr_pred_base lr_pred20 lr_pred15
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897
In [45]:
lr_score15
Out[45]:
{'MSE': 0.7968925999367115,
 'RMSE': 0.8926884114497686,
 'MAE': 0.6261544545364333,
 'R2': 0.9805207932836008}
In [46]:
plot_feature_importance(lr_model15,X_train15,15)
Out[46]:
Feature Importance
0 close_10d_avg 29.471317
1 adj close_10d_avg 24.455782
2 ema_9 21.480216
3 adj close_5d_avg 16.328249
4 adj close_1d_ago 10.320838
5 adj close_3d_avg 9.162211
6 close_1d_ago 7.864891
7 adj close_7d_ago 7.162200
8 sma_15 6.734176
9 close_7d_ago 6.356886
10 close_5d_avg 4.707638
11 close_15d_avg 3.343602
12 sma_5 2.379653
13 adj close_15d_avg 2.216233
14 adj close_14d_ago 0.415949

3.d. Linear Regression Model with top 10 Features¶

In [47]:
keep_cols10 = lr_base_feature_importance[:10]['Feature'].tolist()

X_train10 = X_train[keep_cols10]
X_test10 = X_test[keep_cols10]

scaler = StandardScaler()
X_train_scaled10 = scaler.fit_transform(X_train10)
X_test_scaled10 = scaler.transform(X_test10)
In [48]:
# Train the linear regression model
lr_model10 = LinearRegression()
lr_model10.fit(X_train_scaled10, y_train)

# Make predictions on the scaled test set
lr_pred10 = lr_model10.predict(X_test_scaled10)
lr_score10 = evaluate_regression_model(y_test, lr_pred10)
Mean Squared Error (MSE): 0.786
Root Mean Squared Error (RMSE): 0.887
Mean Absolute Error (MAE): 0.626
R-squared (R2): 0.981
In [49]:
prediction_df['lr_pred10'] = lr_pred10

prediction_df.head()
Out[49]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508
In [50]:
lr_score10
Out[50]:
{'MSE': 0.7864993593280865,
 'RMSE': 0.8868479911056271,
 'MAE': 0.6260216655405225,
 'R2': 0.9807748451875646}
In [51]:
plot_feature_importance(lr_model10,X_train10,10)
Out[51]:
Feature Importance
0 ema_9 16.842727
1 close_5d_avg 14.158982
2 adj close_3d_avg 8.615665
3 adj close_5d_avg 6.493537
4 close_15d_avg 6.467632
5 close_10d_avg 6.069412
6 sma_5 3.130954
7 adj close_10d_avg 1.666197
8 close_1d_ago 0.905094
9 adj close_15d_avg 0.029654

4. Ridge Regression Parameter Fine Tuning¶

4.a. Ridge Regression with All Features¶

In [52]:
ridge_model = Ridge()

    # Define the hyperparameter grid to search
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

    # Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

    # Get the best model
best_ridge_model = grid_search.best_estimator_

    # Make predictions on the test set
ridge_pred_base = best_ridge_model.predict(X_test_scaled)

    # Evaluate the best model
mse = mean_squared_error(y_test, ridge_pred_base)
rmse = mean_squared_error(y_test, ridge_pred_base, squared=False)
mae = mean_absolute_error(y_test, ridge_pred_base)
r2 = r2_score(y_test, ridge_pred_base)

print("Best Ridge Regression Model:")
print(f"Best alpha: {best_ridge_model.alpha}")
print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
print(f"Mean Squared Error: {np.round(mse,3)}")
print(f"Mean Absolute Error: {np.round(mae,3)}")
print(f"R2 Score: {np.round(r2,3)}")

ridge_score = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
Best Ridge Regression Model:
Best alpha: 0.001
Root Mean Squared Error (RMSE): 0.871
Mean Squared Error: 0.759
Mean Absolute Error: 0.606
R2 Score: 0.981
In [53]:
ridge_base_feature_importance = plot_feature_importance(best_ridge_model,X_train,20)
In [54]:
ridge_base_feature_importance[:20]
Out[54]:
Feature Importance
0 close_5d_avg 19.627386
1 ema_9 18.897025
2 sma_5 18.403314
3 adj close_5d_avg 11.844801
4 close_10d_avg 9.872194
5 adj close_10d_avg 9.389002
6 close_15d_avg 7.887685
7 close_1d_ago 7.038813
8 close_7d_avg 6.958126
9 adj close_1d_ago 6.600528
10 close_5d_ago 6.008206
11 sma_15 5.538994
12 low_5d_avg 4.965283
13 low_10d_avg 4.840114
14 open_5d_avg 4.446916
15 high_5d_avg 3.714418
16 open_10d_avg 3.486739
17 high_30d_avg 2.896473
18 close_3d_ago 2.893431
19 open_15d_avg 2.892245
In [55]:
prediction_df['ridge_pred_base'] = ridge_pred_base

prediction_df.head()
Out[55]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624

4.b. Ridge Regression with top 20 Features¶

In [56]:
keep_cols20 = ridge_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)

# Train model
ridge_model20 = Ridge(alpha=0.001)
ridge_model20.fit(X_train_scaled20, y_train)


# Make predictions on the scaled test set
ridge_pred20 = ridge_model20.predict(X_test_scaled20)
ridge_score20 = evaluate_regression_model(y_test, ridge_pred20)
Mean Squared Error (MSE): 0.73
Root Mean Squared Error (RMSE): 0.854
Mean Absolute Error (MAE): 0.586
R-squared (R2): 0.982
In [57]:
plot_feature_importance(ridge_model20,X_train20,20)
Out[57]:
Feature Importance
0 sma_5 25.441510
1 close_5d_avg 22.435160
2 adj close_5d_avg 11.490239
3 adj close_1d_ago 5.671591
4 adj close_10d_avg 5.507894
5 close_5d_ago 5.147037
6 close_1d_ago 4.554521
7 close_10d_avg 4.509604
8 close_15d_avg 3.719596
9 low_5d_avg 3.022988
10 sma_15 2.915010
11 ema_9 2.899160
12 low_10d_avg 1.985008
13 high_5d_avg 1.745675
14 open_10d_avg 1.590243
15 open_5d_avg 1.561036
16 open_15d_avg 1.264441
17 close_3d_ago 0.798379
18 close_7d_avg 0.640924
19 high_30d_avg 0.200250
In [58]:
prediction_df['ridge_pred20'] = ridge_pred20

prediction_df.head()
Out[58]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649

5. Lasso Regression Model Parameter Fine Tuning¶

5.a Lasso Regression with All Features¶

In [59]:
lasso_model = Lasso()

param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

    # Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

    # Get the best model
best_lasso_model = grid_search.best_estimator_

    # Make predictions on the test set
lasso_pred_base = best_lasso_model.predict(X_test_scaled)

    # Evaluate the best model
mse = mean_squared_error(y_test, lasso_pred_base)
rmse = mean_squared_error(y_test, lasso_pred_base, squared=False)
mae = mean_absolute_error(y_test, lasso_pred_base)
r2 = r2_score(y_test, lasso_pred_base)

print("Best Lasso Regression Model:")
print(f"Best alpha: {best_lasso_model.alpha}")
print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
print(f"Mean Squared Error: {np.round(mse,3)}")
print(f"Mean Absolute Error: {np.round(mae,3)}")
print(f"R2 Score: {np.round(r2,3)}")

lasso_score = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
Best Lasso Regression Model:
Best alpha: 0.001
Root Mean Squared Error (RMSE): 0.97
Mean Squared Error: 0.94
Mean Absolute Error: 0.663
R2 Score: 0.977
In [60]:
lasso_base_feature_importance = plot_feature_importance(best_lasso_model,X_train,20)
In [61]:
lasso_base_feature_importance[:20]
Out[61]:
Feature Importance
0 ema_9 4.177865
1 close_3d_avg 1.314731
2 macd 1.312350
3 macd_signal 1.160117
4 sma_15 1.006266
5 adj close_3d_avg 0.822478
6 low_1d_ago 0.722646
7 close_3d_ago 0.583764
8 rsi 0.543082
9 open_3d_ago 0.510853
10 sma_30 0.492814
11 high_1d_ago 0.466770
12 adj close_3d_ago 0.422750
13 low_15d_avg 0.404237
14 open_1d_ago 0.365671
15 high_14d_ago 0.307277
16 low_30d_avg 0.291303
17 sma_10 0.255037
18 open_30d_avg 0.241245
19 high_30d_avg 0.227913
In [62]:
prediction_df['lasso_pred_base'] = lasso_pred_base

prediction_df.head()
Out[62]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350

5.b Lasso Regression with Top 20 Features¶

In [63]:
keep_cols20 = lasso_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)

# Train model
lasso_model20 = Lasso(alpha=0.001)
lasso_model20.fit(X_train_scaled20, y_train)


# Make predictions on the scaled test set
lasso_pred20 = lasso_model20.predict(X_test_scaled20)
lasso_score20 = evaluate_regression_model(y_test, lasso_pred20)
Mean Squared Error (MSE): 0.948
Root Mean Squared Error (RMSE): 0.974
Mean Absolute Error (MAE): 0.665
R-squared (R2): 0.977
In [64]:
plot_feature_importance(lasso_model20,X_train20,20)
Out[64]:
Feature Importance
0 ema_9 4.017287
1 close_3d_avg 2.790781
2 low_1d_ago 1.047406
3 close_3d_ago 1.034968
4 sma_30 1.029511
5 low_15d_avg 0.997798
6 macd 0.968169
7 macd_signal 0.880009
8 high_14d_ago 0.568316
9 high_1d_ago 0.533765
10 rsi 0.496376
11 sma_10 0.423288
12 adj close_3d_avg 0.228550
13 open_3d_ago 0.204478
14 open_1d_ago 0.139447
15 low_30d_avg 0.025033
16 sma_15 0.000000
17 adj close_3d_ago 0.000000
18 open_30d_avg 0.000000
19 high_30d_avg 0.000000
In [65]:
prediction_df['lasso_pred20'] = lasso_pred20

prediction_df.head()
Out[65]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285

6. Elastic Net Regression Model Parameter Fine Tuning¶

6.a. Elastic Net with All Features¶

In [66]:
elastic_net_model = ElasticNet()

    # Define the hyperparameter grid to search
param_grid = {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
    }

    # Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=elastic_net_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

    # Get the best model
best_elastic_net_model = grid_search.best_estimator_

    # Make predictions on the test set
elastic_pred_base = best_elastic_net_model.predict(X_test_scaled)

    # Evaluate the best model
mse = mean_squared_error(y_test, elastic_pred_base)
rmse = mean_squared_error(y_test, elastic_pred_base, squared=False)
mae = mean_absolute_error(y_test, elastic_pred_base)
r2 = r2_score(y_test, elastic_pred_base)

print("Best Elastic Net Model:")
print(f"Best alpha: {best_elastic_net_model.alpha}")
print(f"Best l1_ratio: {best_elastic_net_model.l1_ratio}")
print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
print(f"Mean Squared Error: {np.round(mse,3)}")
print(f"Mean Absolute Error: {np.round(mae,3)}")
print(f"R2 Score: {np.round(r2,3)}")

elastic_score = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
Best Elastic Net Model:
Best alpha: 0.001
Best l1_ratio: 0.1
Root Mean Squared Error (RMSE): 0.953
Mean Squared Error: 0.908
Mean Absolute Error: 0.653
R2 Score: 0.978
In [67]:
elastic_base_feature_importance = plot_feature_importance(best_elastic_net_model,X_train,20)
In [68]:
elastic_base_feature_importance[:20]
Out[68]:
Feature Importance
0 ema_9 1.613307
1 sma_5 1.518359
2 macd 1.493389
3 close_3d_avg 1.290926
4 sma_10 1.289912
5 macd_signal 1.253588
6 adj close_3d_avg 1.239816
7 low_1d_ago 0.982940
8 sma_15 0.923173
9 high_1d_ago 0.815800
10 close_3d_ago 0.765736
11 open_1d_ago 0.631358
12 sma_30 0.586684
13 low_3d_avg 0.580421
14 adj close_3d_ago 0.544949
15 open_3d_ago 0.531747
16 rsi 0.485403
17 high_3d_avg 0.431768
18 close_1d_ago 0.425664
19 low_15d_avg 0.399066
In [69]:
prediction_df['elastic_pred_base'] = elastic_pred_base

prediction_df.head()
Out[69]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20 elastic_pred_base
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734 54.444972
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946 54.198628
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497 54.011290
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163 53.818335
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285 54.037049

6.b. Elastic Net with Top 20 Features¶

In [70]:
keep_cols20 = elastic_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)

# Train model
elastic_model20 = ElasticNet(alpha=0.001,l1_ratio =  0.9)
elastic_model20.fit(X_train_scaled20, y_train)


# Make predictions on the scaled test set
elastic_pred20 = elastic_model20.predict(X_test_scaled20)
elastic_score20 = evaluate_regression_model(y_test, elastic_pred20)
Mean Squared Error (MSE): 0.948
Root Mean Squared Error (RMSE): 0.974
Mean Absolute Error (MAE): 0.667
R-squared (R2): 0.977
In [71]:
plot_feature_importance(elastic_model20,X_train20,20)
Out[71]:
Feature Importance
0 ema_9 3.518012
1 close_3d_avg 2.975563
2 low_15d_avg 1.164547
3 sma_30 1.008792
4 low_1d_ago 0.912273
5 high_1d_ago 0.840268
6 sma_10 0.826143
7 adj close_3d_avg 0.783827
8 macd_signal 0.701467
9 macd 0.690383
10 sma_15 0.565839
11 adj close_3d_ago 0.556647
12 rsi 0.506954
13 close_3d_ago 0.408292
14 open_3d_ago 0.238611
15 high_3d_avg 0.039417
16 low_3d_avg 0.000000
17 sma_5 0.000000
18 close_1d_ago 0.000000
19 open_1d_ago 0.000000
In [72]:
prediction_df['elastic_pred20'] = elastic_pred20

prediction_df.head()
Out[72]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20 elastic_pred_base elastic_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734 54.444972 54.503795
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946 54.198628 54.263269
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497 54.011290 54.055709
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163 53.818335 53.920259
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285 54.037049 54.058664

7. Model Comparison¶

In [73]:
ela_df = pd.DataFrame([elastic_score.keys(),elastic_score.values()])
ela_df.columns = ela_df.iloc[0]
ela_df = ela_df[1:].reset_index(drop=True)
ela_df['Model'] = 'Elastic_Net with All Features'

ela_20_df = pd.DataFrame([elastic_score20.keys(),elastic_score20.values()])
ela_20_df.columns = ela_20_df.iloc[0]
ela_20_df = ela_20_df[1:].reset_index(drop=True)
ela_20_df['Model'] = 'Elastic_Net with Top 20 Features'

lasso_df = pd.DataFrame([lasso_score.keys(),lasso_score.values()])
lasso_df.columns = lasso_df.iloc[0]
lasso_df = lasso_df[1:].reset_index(drop=True)
lasso_df['Model'] = 'Lasso with All Features'

lasso_20_df = pd.DataFrame([lasso_score20.keys(),lasso_score20.values()])
lasso_20_df.columns = lasso_20_df.iloc[0]
lasso_20_df = lasso_20_df[1:].reset_index(drop=True)
lasso_20_df['Model'] = 'Lasso with Top 20 Features'

ridge_df = pd.DataFrame([ridge_score.keys(),ridge_score.values()])
ridge_df.columns = ridge_df.iloc[0]
ridge_df = ridge_df[1:].reset_index(drop=True)
ridge_df['Model'] = 'Ridge with All Features'

ridge_20_df = pd.DataFrame([ridge_score20.keys(),ridge_score20.values()])
ridge_20_df.columns = ridge_20_df.iloc[0]
ridge_20_df = ridge_20_df[1:].reset_index(drop=True)
ridge_20_df['Model'] = 'Ridge with Top 20 Features'

lr_base_df = pd.DataFrame([lr_score_base.keys(),lr_score_base.values()])
lr_base_df.columns = lr_base_df.iloc[0]
lr_base_df = lr_base_df[1:].reset_index(drop=True)
lr_base_df['Model'] = 'Linear Reg. with All Features'

lr_20_df = pd.DataFrame([lr_score20.keys(),lr_score20.values()])
lr_20_df.columns = lr_20_df.iloc[0]
lr_20_df = lr_20_df[1:].reset_index(drop=True)
lr_20_df['Model'] = 'Linear Reg. with Top 20 Features'

lr_15_df = pd.DataFrame([lr_score15.keys(),lr_score15.values()])
lr_15_df.columns = lr_15_df.iloc[0]
lr_15_df = lr_15_df[1:].reset_index(drop=True)
lr_15_df['Model'] = 'Linear Reg. with Top 15 Features'

lr_10_df = pd.DataFrame([lr_score10.keys(),lr_score10.values()])
lr_10_df.columns = lr_10_df.iloc[0]
lr_10_df = lr_10_df[1:].reset_index(drop=True)
lr_10_df['Model'] = 'Linear Reg. with Top 10 Features'

df_compare = pd.concat([ela_df,lasso_df,ridge_df,ela_20_df,lasso_20_df,ridge_20_df,
                        lr_base_df,lr_20_df,lr_15_df,lr_10_df]).sort_values(by=['R2'],ascending=False).reset_index(drop=True)

df_compare
Out[73]:
MSE RMSE MAE R2 Model
0 0.729511 0.854114 0.585601 0.982168 Ridge with Top 20 Features
1 0.758611 0.870983 0.605987 0.981457 Ridge with All Features
2 0.768289 0.876521 0.613346 0.98122 Linear Reg. with Top 20 Features
3 0.77558 0.88067 0.613905 0.981042 Linear Reg. with All Features
4 0.786499 0.886848 0.626022 0.980775 Linear Reg. with Top 10 Features
5 0.796893 0.892688 0.626154 0.980521 Linear Reg. with Top 15 Features
6 0.90796 0.952869 0.652726 0.977806 Elastic_Net with All Features
7 0.940304 0.969693 0.663218 0.977015 Lasso with All Features
8 0.948047 0.973677 0.664517 0.976826 Lasso with Top 20 Features
9 0.948242 0.973777 0.666597 0.976821 Elastic_Net with Top 20 Features

After retraining the models with different alpha and input features, Ridge regression model with alpha 0.001 and all features performed best among others.

  • Mean Squared Error (MSE) 0.729511:

MSE measures the average squared difference between predicted and actual values. In this case, the MSE of 0.729511 is relatively low, indicating that, on average, the squared errors between predicted and actual values are small. Lower MSE values suggest better accuracy.

  • Root Mean Squared Error (RMSE) 0.854114:

RMSE is the square root of the MSE and provides a measure of the average magnitude of the errors. A lower RMSE (0.854114) signifies that, on average, the model's predictions are close to the actual values. It is in the same unit as the target variable.

  • Mean Absolute Error (MAE) 0.585601:

MAE measures the average absolute difference between predicted and actual values. With an MAE of 0.585601, the model's predictions, on average, deviate by approximately 0.60886 units from the actual values. Lower MAE values indicate better accuracy.

  • R-squared (R2) 0.982168:

R2 represents the proportion of variance in the target variable that is predictable from the independent variables. An R2 value of 0.982168 is exceptionally high, indicating that the model explains about 98.21% of the variance in the closing stock prices. A higher R2 value suggests a better accuracy.

In summary, the provided accuracy scores collectively suggest that the model performs exceptionally well. The low MSE, RMSE, MAE and high R2 score indicate that the model's predictions are close to the actual values.

In [74]:
prediction_df
Out[74]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20 elastic_pred_base elastic_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734 54.444972 54.503795
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946 54.198628 54.263269
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497 54.011290 54.055709
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163 53.818335 53.920259
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285 54.037049 54.058664
... ... ... ... ... ... ... ... ... ... ... ... ...
2694 2023-11-01 67.970001 67.251327 66.691627 66.293326 66.511564 67.298025 66.970336 66.962254 67.016646 67.047164 67.123766
2695 2023-11-02 68.820000 68.295223 67.635666 67.398789 67.681439 68.052648 67.865085 67.502858 67.485406 67.584452 67.672327
2696 2023-11-03 68.239998 68.864264 68.759090 68.689217 68.943305 68.811739 68.806564 68.155329 68.270610 68.171208 68.378378
2697 2023-11-06 68.489998 68.041446 68.134383 68.593341 68.842344 68.125227 68.286008 68.186065 68.384744 68.222377 68.472590
2698 2023-11-07 69.019997 68.239220 68.843554 69.136339 69.178896 68.336189 68.670691 68.368078 68.539077 68.463381 68.531877

970 rows × 12 columns

In [75]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred_base,label='lr_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred20,label='lr_pred20')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred15,label='lr_pred15')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred10,label='lr_pred10')

sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred_base,label='ridge_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred20,label='ridge_pred20')

sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred_base,label='lasso_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred20,label='lasso_pred20')

sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred_base,label='elastic_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred20,label='elastic_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [76]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred_base,label='lr_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred20,label='lr_pred20')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred15,label='lr_pred15')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred10,label='lr_pred10')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [77]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred_base,label='ridge_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred20,label='ridge_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [78]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred_base,label='lasso_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred20,label='lasso_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [79]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred_base,label='elastic_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred20,label='elastic_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()

7.a Final Best Model¶

In [80]:
# target column is next day's close price
y_train = train_df['close_1d_next'].copy()
X_train = train_df.drop(['close_1d_next'], 1)

# target column is next day's close price
y_test  = test_df['close_1d_next'].copy()
X_test  = test_df.drop(['close_1d_next'], 1)
In [81]:
ridge_20_features = ridge_base_feature_importance[:20]['Feature'].tolist()
X_train = X_train[ridge_20_features]
X_test = X_test[ridge_20_features]
In [82]:
def train_ridge_regression(X_train,X_test,y_train,y_test):
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    ridge_model = Ridge(alpha=0.001)
    ridge_model.fit(X_train_scaled, y_train)


    # Make predictions on the scaled test set
    ridge_pred = ridge_model.predict(X_test_scaled)
    ridge_score = evaluate_regression_model2(y_test, ridge_pred)

    return ridge_model,ridge_pred,ridge_score
In [83]:
ridge_model, ridge_pred, ridge_score = train_ridge_regression(X_train,X_test,y_train,y_test)
In [84]:
ridge_score
Out[84]:
{'MSE': 0.7295114881722916,
 'RMSE': 0.8541144467647714,
 'MAE': 0.5856010765186319,
 'R2': 0.9821678541358965}
In [85]:
ridge_pred[:15]
Out[85]:
array([54.34934416, 54.38159609, 54.23318706, 54.03650291, 54.05964943,
       54.17549586, 54.25017275, 54.54902816, 54.18781481, 54.74202716,
       55.14666847, 55.18770245, 55.53205975, 55.4774224 , 55.51006433])
In [86]:
plot_regression_accuracy(y_test, ridge_pred)
In [87]:
plot_predictions(df,ridge_pred)
In [88]:
plot_feature_importance(ridge_model,X_train,20)
Out[88]:
Feature Importance
0 sma_5 25.441510
1 close_5d_avg 22.435160
2 adj close_5d_avg 11.490239
3 adj close_1d_ago 5.671591
4 adj close_10d_avg 5.507894
5 close_5d_ago 5.147037
6 close_1d_ago 4.554521
7 close_10d_avg 4.509604
8 close_15d_avg 3.719596
9 low_5d_avg 3.022988
10 sma_15 2.915010
11 ema_9 2.899160
12 low_10d_avg 1.985008
13 high_5d_avg 1.745675
14 open_10d_avg 1.590243
15 open_5d_avg 1.561036
16 open_15d_avg 1.264441
17 close_3d_ago 0.798379
18 close_7d_avg 0.640924
19 high_30d_avg 0.200250

The residual, scatter, and time series line charts above clearly show that the predicted values are very close to the actual values. These visualizations confirm that the model is very good at making accurate predictions, highlighting its strong performance and reliability in understanding the details of the data.

8. Train Multiple Different Stocks with Ridge Regression Model¶

In [89]:
def preprocess_data(df):
    df['ema_9'] = df['close'].ewm(9).mean().shift()
    df['sma_5'] = df['close'].rolling(5).mean().shift()
    df['sma_10'] = df['close'].rolling(10).mean().shift()
    df['sma_15'] = df['close'].rolling(15).mean().shift()
    df['sma_30'] = df['close'].rolling(30).mean().shift()
    df['rsi'] = rsi(df) #.fillna(0)
    df['mfi'] = mfi(df, 14)
    EMA_12 = pd.Series(df['close'].ewm(span=12, min_periods=12).mean())
    EMA_26 = pd.Series(df['close'].ewm(span=26, min_periods=26).mean())
    df['macd'] = pd.Series(EMA_12 - EMA_26)
    df['macd_signal'] = pd.Series(df.macd.ewm(span=9, min_periods=9).mean())
    df['close_1d_next'] = df['close'].shift(-1)

    df['close_1d_ago'] = df['close'].shift(1)
    df['close_3d_ago'] = df['close'].shift(3)
    df['close_5d_ago'] = df['close'].shift(5)
    df['close_1w_ago'] = df['close'].shift(7)
    df['close_2w_ago'] = df['close'].shift(14)
    df['close_3w_ago'] = df['close'].shift(21)
    df['close_4w_ago'] = df['close'].shift(28)

    df['adj_close_1d_ago'] = df['adj close'].shift(1)
    df['adj_close_3d_ago'] = df['adj close'].shift(3)
    df['adj_close_5d_ago'] = df['adj close'].shift(5)
    df['adj_close_1w_ago'] = df['adj close'].shift(7)
    df['adj_close_2w_ago'] = df['adj close'].shift(14)
    df['adj_close_3w_ago'] = df['adj close'].shift(21)
    df['adj_close_4w_ago'] = df['adj close'].shift(28)

    df['open_1d_ago'] = df['open'].shift(1)
    df['open_3d_ago'] = df['open'].shift(3)
    df['open_5d_ago'] = df['open'].shift(5)
    df['open_1w_ago'] = df['open'].shift(7)
    df['open_2w_ago'] = df['open'].shift(14)
    df['open_3w_ago'] = df['open'].shift(21)
    df['open_4w_ago'] = df['open'].shift(28)

    df['high_1d_ago'] = df['high'].shift(1)
    df['high_3d_ago'] = df['high'].shift(3)
    df['high_5d_ago'] = df['high'].shift(5)
    df['high_1w_ago'] = df['high'].shift(7)
    df['high_2w_ago'] = df['high'].shift(14)
    df['high_3w_ago'] = df['high'].shift(21)
    df['high_4w_ago'] = df['high'].shift(28)

    df['low_1d_ago'] = df['low'].shift(1)
    df['low_3d_ago'] = df['low'].shift(3)
    df['low_5d_ago'] = df['low'].shift(5)
    df['low_1w_ago'] = df['low'].shift(7)
    df['low_2w_ago'] = df['low'].shift(14)
    df['low_3w_ago'] = df['low'].shift(21)
    df['low_4w_ago'] = df['low'].shift(28)

    df['volume_1d_ago'] = df['volume'].shift(1)
    df['volume_3d_ago'] = df['volume'].shift(3)
    df['volume_5d_ago'] = df['volume'].shift(5)
    df['volume_1w_ago'] = df['volume'].shift(7)
    df['volume_2w_ago'] = df['volume'].shift(14)
    df['volume_3w_ago'] = df['volume'].shift(21)
    df['volume_4w_ago'] = df['volume'].shift(28)

    df['open_3d_avg'] = df['open'].rolling(window=3).mean()
    df['open_5d_avg'] = df['open'].rolling(window=5).mean()
    df['open_7d_avg'] = df['open'].rolling(window=7).mean()
    df['open_10d_avg'] = df['open'].rolling(window=10).mean()
    df['open_15d_avg'] = df['open'].rolling(window=15).mean()
    df['open_30d_avg'] = df['open'].rolling(window=30).mean()

    df['high_3d_avg'] = df['high'].rolling(window=3).mean()
    df['high_5d_avg'] = df['high'].rolling(window=5).mean()
    df['high_7d_avg'] = df['high'].rolling(window=7).mean()
    df['high_10d_avg'] = df['high'].rolling(window=10).mean()
    df['high_15d_avg'] = df['high'].rolling(window=15).mean()
    df['high_30d_avg'] = df['high'].rolling(window=30).mean()

    df['low_3d_avg'] = df['low'].rolling(window=3).mean()
    df['low_5d_avg'] = df['low'].rolling(window=5).mean()
    df['low_7d_avg'] = df['low'].rolling(window=7).mean()
    df['low_10d_avg'] = df['low'].rolling(window=10).mean()
    df['low_15d_avg'] = df['low'].rolling(window=15).mean()
    df['low_30d_avg'] = df['low'].rolling(window=30).mean()

    df['volume_3d_avg'] = df['volume'].rolling(window=3).mean()
    df['volume_5d_avg'] = df['volume'].rolling(window=5).mean()
    df['volume_7d_avg'] = df['volume'].rolling(window=7).mean()
    df['volume_10d_avg'] = df['volume'].rolling(window=10).mean()
    df['volume_15d_avg'] = df['volume'].rolling(window=15).mean()
    df['volume_30d_avg'] = df['volume'].rolling(window=30).mean()

    df['adj_close_3d_avg'] = df['adj close'].rolling(window=3).mean()
    df['adj_close_5d_avg'] = df['adj close'].rolling(window=5).mean()
    df['adj_close_7d_avg'] = df['adj close'].rolling(window=7).mean()
    df['adj_close_10d_avg'] = df['adj close'].rolling(window=10).mean()
    df['adj_close_15d_avg'] = df['adj close'].rolling(window=15).mean()
    df['adj_close_30d_avg'] = df['adj close'].rolling(window=30).mean()
    
    return df
In [90]:
df_all = pd.read_parquet(out_loc+"stock_1d.parquet")
df_all.columns = df_all.columns.str.lower()
In [91]:
### keep stocks in data with min year 2013, max year 2023
stock_min_dt = pd.DataFrame(df_all.groupby('symbol')['date'].min()).reset_index().rename(columns={'date':'min_date'})
stock_max_dt = pd.DataFrame(df_all.groupby('symbol')['date'].max()).reset_index().rename(columns={'date':'max_date'})
stock_cnt_dt = pd.DataFrame(df_all.groupby('symbol')['date'].count()).reset_index().rename(columns={'date':'days_cnt'})

stock_cnt = stock_min_dt.merge(stock_max_dt,on='symbol').merge(stock_cnt_dt,on='symbol')
stock_cnt['min_year'] = stock_cnt['min_date'].dt.year
stock_cnt['max_year'] = stock_cnt['max_date'].dt.year

keep_stocks = stock_cnt[(stock_cnt['min_year']==2013)&(stock_cnt['max_year']==2023)&(stock_cnt['days_cnt']>=2500)]['symbol'].unique().tolist()

stock_cnt.head()
Out[91]:
symbol min_date max_date days_cnt min_year max_year
0 A 2013-01-02 2023-11-08 2733 2013 2023
1 AAL 2013-01-02 2023-11-08 2733 2013 2023
2 AAPL 2013-01-02 2023-11-08 2733 2013 2023
3 ABBV 2013-01-02 2023-11-08 2733 2013 2023
4 ABNB 2020-12-10 2023-11-08 733 2020 2023
In [92]:
df_2023 = df_all[(df_all.date.dt.year==2023) & (df_all.symbol.isin(keep_stocks))]
# volume vs stocks
volume_2023 = pd.DataFrame(df_2023.groupby(['symbol','security','gics sector'])['volume'].sum()).reset_index()
volume_2023 = volume_2023.sort_values(by='volume',ascending=False).reset_index(drop=True)
volume_2023.head()
Out[92]:
symbol security gics sector volume
0 TSLA Tesla, Inc. Consumer Discretionary 3.009291e+10
1 AMD AMD Information Technology 1.342035e+10
2 AMZN Amazon Consumer Discretionary 1.305160e+10
3 AAPL Apple Inc. Information Technology 1.303964e+10
4 F Ford Motor Company Consumer Discretionary 1.278319e+10
In [93]:
# volume vs sectors
sector_2023 = pd.DataFrame(df_2023.groupby(['gics sector'])['volume'].sum()).reset_index()
sector_2023 = sector_2023.sort_values(by='volume',ascending=False).reset_index(drop=True)
sector_2023
Out[93]:
gics sector volume
0 Consumer Discretionary 9.171407e+10
1 Information Technology 8.888840e+10
2 Financials 6.728113e+10
3 Communication Services 5.267892e+10
4 Health Care 3.755560e+10
5 Industrials 3.672492e+10
6 Energy 3.245171e+10
7 Consumer Staples 2.824873e+10
8 Utilities 2.214882e+10
9 Materials 1.432867e+10
10 Real Estate 1.318748e+10
In [94]:
# filter top 5 sectors with highest volume in 2023
sector_list = sector_2023[:5]['gics sector'].tolist()

stock_list = []

num_stocks = 5
# stocks with highest volume in each sector
for sec in sector_list:
    stock_list.append(volume_2023[volume_2023['gics sector']==sec]['symbol'][:num_stocks].tolist())
    
    
stock_list = [item for sublist in stock_list for item in sublist]

len(stock_list)
Out[94]:
25
In [95]:
df_stocks = df_all[df_all['symbol'].isin(stock_list)].reset_index(drop=True)
df_stocks.head()
Out[95]:
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded
0 2013-01-02 18.003504 18.193193 17.931683 18.099348 18.099348 101550348.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
1 2013-01-03 18.141392 18.316566 18.036036 18.109859 18.109859 92635272.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
2 2013-01-04 18.251753 18.555305 18.210211 18.467718 18.467718 110429460.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
3 2013-01-07 18.404655 18.503002 18.282784 18.387136 18.387136 66161772.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
4 2013-01-08 18.406906 18.425926 18.128880 18.350851 18.350851 66976956.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
In [96]:
stock_compare = []

for stock in stock_list:
        stock_data = df_stocks[df_stocks['symbol'] == stock]
        stock_data = preprocess_data(stock_data)
        stock_data = stock_data.dropna().reset_index(drop=True)
        
                # Split the DataFrame into training and testing sets
        train_df_temp = stock_data[stock_data.date.dt.year<2020]
        test_df_temp = stock_data[stock_data.date.dt.year>=2020]

        drop_cols1 = ['date','open','high','low','close','adj close','volume','symbol','security',
         'gics sector','gics sub-industry','headquarters location','date added','cik','founded']

        train_df_temp = train_df_temp.drop(drop_cols1, 1)

        test_df_temp  = test_df_temp.drop(drop_cols1, 1)

        # target column is next day's close price
        y_train_temp = train_df_temp['close_1d_next'].copy()
        X_train_temp = train_df_temp.drop(['close_1d_next'], 1)

        # target column is next day's close price
        y_test_temp  = test_df_temp['close_1d_next'].copy()
        X_test_temp  = test_df_temp.drop(['close_1d_next'], 1)
#        print(stock, len(X_train), len(X_test), len(y_train), len(y_test))
        temp_model, temp_pred, temp_score = train_ridge_regression(X_train_temp,X_test_temp,y_train_temp,y_test_temp)
        
        score_df = pd.DataFrame([temp_score.keys(),temp_score.values()])
        score_df.columns = score_df.iloc[0]
        score_df = score_df[1:].reset_index(drop=True)
        score_df['symbol'] = stock
        
        stock_compare.append(score_df)
        
compare_df = pd.concat(stock_compare).sort_values(by='R2',ascending=False).reset_index(drop =True)
In [97]:
compare_df
Out[97]:
MSE RMSE MAE R2 symbol
0 56.919689 7.544514 5.112978 0.995061 NVDA
1 0.559182 0.747785 0.529467 0.993251 VZ
2 7.770865 2.787627 2.107652 0.992581 AAPL
3 5.039335 2.244846 1.670374 0.992108 GOOG
4 4.98575 2.232879 1.654225 0.992002 GOOGL
5 46.381058 6.810364 4.690443 0.990596 META
6 2.057386 1.434359 1.020137 0.990386 CVS
7 0.147953 0.384647 0.271314 0.990362 F
8 1.27562 1.129434 0.839937 0.98989 GM
9 28.055837 5.296776 3.992424 0.989314 MSFT
10 84.835254 9.210606 6.497099 0.988764 TSLA
11 0.575533 0.758639 0.55086 0.988709 PFE
12 0.581909 0.76283 0.577919 0.988411 BAC
13 0.26222 0.512075 0.377068 0.988041 KEY
14 1.80572 1.343771 0.958788 0.987257 INTC
15 0.991949 0.995966 0.734726 0.987031 WFC
16 11.147927 3.338851 2.469883 0.985621 AMZN
17 0.174755 0.418037 0.281833 0.983153 T
18 10.145635 3.185221 2.328928 0.982675 AMD
19 2.409729 1.55233 1.141577 0.981822 C
20 0.124353 0.352637 0.259468 0.979688 HBAN
21 0.870412 0.932958 0.684029 0.978487 BMY
22 2.087087 1.444675 1.154954 0.96863 CCL
23 4.398395 2.097235 1.52279 0.967914 JNJ
24 4.070663 2.017589 1.528408 0.589411 VTRS

The final phase of the project involved applying the developed model to real-world scenarios. By identifying the top 5 industries with the highest volume in 2023, we ensured that our predictions were grounded in current market dynamics. The subsequent selection of 5 stocks within each industry added a layer of practicality to our findings.

The model's stellar performance on NVDA, AAPL, VZ, GOOG, and GOOGL proved its robustness in diverse market conditions. Simultaneously, the challenges encountered with VTRS opened up opportunities for further investigation into the factors contributing to its underperformance.

In [ ]: